Diffstat (limited to 'fs')
180 files changed, 8109 insertions, 3179 deletions
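Most of the ADFS changes below replace the open-coded, per-buffer memcpy handling of directory entries with two generic helpers, adfs_dir_copyfrom() and adfs_dir_copyto(), which copy an entry that may straddle a block boundary in at most two pieces. The stand-alone sketch below is illustrative only and is not the kernel code: 'blocks', 'nr_blocks' and 'blocksize' stand in for dir->bhs[], dir->nr_buffers and sb->s_blocksize, and it assumes (as the 26-byte directory entries do) that a copy never spans more than two buffers.

#include <string.h>
#include <stdint.h>

/*
 * Illustrative split-copy sketch: copy 'len' bytes starting at byte
 * 'offset' out of an array of fixed-size buffers, crossing at most one
 * buffer boundary.
 */
static int dir_copyfrom(void *dst, uint8_t **blocks, unsigned int nr_blocks,
                        unsigned int blocksize, unsigned int offset,
                        size_t len)
{
    unsigned int index = offset / blocksize;
    unsigned int remain;

    offset %= blocksize;
    remain = blocksize - offset;

    /* reject a copy that would run past the last buffer */
    if (index + (remain < len) >= nr_blocks)
        return -1;

    if (remain < len) {
        /* first piece: the tail of the current buffer */
        memcpy(dst, blocks[index] + offset, remain);
        dst = (uint8_t *)dst + remain;
        len -= remain;
        index++;
        offset = 0;
    }

    /* second (or only) piece */
    memcpy(dst, blocks[index] + offset, len);
    return 0;
}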
diff --git a/fs/Makefile b/fs/Makefile index 1148c555c4d3..98be354fdb61 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -37,7 +37,7 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index b7e844d2f321..699c4fa8b78b 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -26,14 +26,13 @@ static inline u16 adfs_filetype(u32 loadaddr) #define ADFS_NDA_PUBLIC_READ (1 << 5) #define ADFS_NDA_PUBLIC_WRITE (1 << 6) -#include "dir_f.h" - /* * adfs file system inode data in memory */ struct adfs_inode_info { loff_t mmu_private; __u32 parent_id; /* parent indirect disc address */ + __u32 indaddr; /* object indirect disc address */ __u32 loadaddr; /* RISC OS load address */ __u32 execaddr; /* RISC OS exec address */ unsigned int attr; /* RISC OS permissions */ @@ -93,15 +92,19 @@ struct adfs_dir { int nr_buffers; struct buffer_head *bh[4]; - - /* big directories need allocated buffers */ - struct buffer_head **bh_fplus; + struct buffer_head **bhs; unsigned int pos; __u32 parent_id; - struct adfs_dirheader dirhead; - union adfs_dirtail dirtail; + union { + struct adfs_dirheader *dirhead; + struct adfs_bigdirheader *bighead; + }; + union { + struct adfs_newdirtail *newtail; + struct adfs_bigdirtail *bigtail; + }; }; /* @@ -122,13 +125,13 @@ struct object_info { struct adfs_dir_ops { int (*read)(struct super_block *sb, unsigned int indaddr, unsigned int size, struct adfs_dir *dir); + int (*iterate)(struct adfs_dir *dir, struct dir_context *ctx); int (*setpos)(struct adfs_dir *dir, unsigned int fpos); int (*getnext)(struct adfs_dir *dir, struct object_info *obj); int (*update)(struct adfs_dir *dir, struct object_info *obj); int (*create)(struct adfs_dir *dir, struct object_info *obj); int (*remove)(struct adfs_dir *dir, struct object_info *obj); - int (*sync)(struct adfs_dir *dir); - void (*free)(struct adfs_dir *dir); + int (*commit)(struct adfs_dir *dir); }; struct adfs_discmap { @@ -145,7 +148,9 @@ int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset); -extern unsigned int adfs_map_free(struct super_block *sb); +void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf); +struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr); +void adfs_free_map(struct super_block *sb); /* Misc */ __printf(3, 4) @@ -167,6 +172,13 @@ extern const struct dentry_operations adfs_dentry_operations; extern const struct adfs_dir_ops adfs_f_dir_ops; extern const struct adfs_dir_ops adfs_fplus_dir_ops; +int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset, + size_t len); +int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src, + size_t len); +void adfs_dir_relse(struct adfs_dir *dir); +int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir); void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj); extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index a54c53244992..77fbd196008f 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -6,12 +6,196 @@ * * Common 
directory handling for ADFS */ +#include <linux/slab.h> #include "adfs.h" /* * For future. This should probably be per-directory. */ -static DEFINE_RWLOCK(adfs_dir_lock); +static DECLARE_RWSEM(adfs_dir_rwsem); + +int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset, + size_t len) +{ + struct super_block *sb = dir->sb; + unsigned int index, remain; + + index = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + remain = sb->s_blocksize - offset; + if (index + (remain < len) >= dir->nr_buffers) + return -EINVAL; + + if (remain < len) { + memcpy(dst, dir->bhs[index]->b_data + offset, remain); + dst += remain; + len -= remain; + index += 1; + offset = 0; + } + + memcpy(dst, dir->bhs[index]->b_data + offset, len); + + return 0; +} + +int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src, + size_t len) +{ + struct super_block *sb = dir->sb; + unsigned int index, remain; + + index = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + remain = sb->s_blocksize - offset; + if (index + (remain < len) >= dir->nr_buffers) + return -EINVAL; + + if (remain < len) { + memcpy(dir->bhs[index]->b_data + offset, src, remain); + src += remain; + len -= remain; + index += 1; + offset = 0; + } + + memcpy(dir->bhs[index]->b_data + offset, src, len); + + return 0; +} + +static void __adfs_dir_cleanup(struct adfs_dir *dir) +{ + dir->nr_buffers = 0; + + if (dir->bhs != dir->bh) + kfree(dir->bhs); + dir->bhs = NULL; + dir->sb = NULL; +} + +void adfs_dir_relse(struct adfs_dir *dir) +{ + unsigned int i; + + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bhs[i]); + + __adfs_dir_cleanup(dir); +} + +static void adfs_dir_forget(struct adfs_dir *dir) +{ + unsigned int i; + + for (i = 0; i < dir->nr_buffers; i++) + bforget(dir->bhs[i]); + + __adfs_dir_cleanup(dir); +} + +int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + struct buffer_head **bhs; + unsigned int i, num; + int block; + + num = ALIGN(size, sb->s_blocksize) >> sb->s_blocksize_bits; + if (num > ARRAY_SIZE(dir->bh)) { + /* We only allow one extension */ + if (dir->bhs != dir->bh) + return -EINVAL; + + bhs = kcalloc(num, sizeof(*bhs), GFP_KERNEL); + if (!bhs) + return -ENOMEM; + + if (dir->nr_buffers) + memcpy(bhs, dir->bhs, dir->nr_buffers * sizeof(*bhs)); + + dir->bhs = bhs; + } + + for (i = dir->nr_buffers; i < num; i++) { + block = __adfs_block_map(sb, indaddr, i); + if (!block) { + adfs_error(sb, "dir %06x has a hole at offset %u", + indaddr, i); + goto error; + } + + dir->bhs[i] = sb_bread(sb, block); + if (!dir->bhs[i]) { + adfs_error(sb, + "dir %06x failed read at offset %u, mapped block 0x%08x", + indaddr, i, block); + goto error; + } + + dir->nr_buffers++; + } + return 0; + +error: + adfs_dir_relse(dir); + + return -EIO; +} + +static int adfs_dir_read(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + dir->sb = sb; + dir->bhs = dir->bh; + dir->nr_buffers = 0; + + return ADFS_SB(sb)->s_dir->read(sb, indaddr, size, dir); +} + +static int adfs_dir_read_inode(struct super_block *sb, struct inode *inode, + struct adfs_dir *dir) +{ + int ret; + + ret = adfs_dir_read(sb, ADFS_I(inode)->indaddr, inode->i_size, dir); + if (ret) + return ret; + + if (ADFS_I(inode)->parent_id != dir->parent_id) { + adfs_error(sb, + "parent directory id changed under me! 
(%06x but got %06x)\n", + ADFS_I(inode)->parent_id, dir->parent_id); + adfs_dir_relse(dir); + ret = -EIO; + } + + return ret; +} + +static void adfs_dir_mark_dirty(struct adfs_dir *dir) +{ + unsigned int i; + + /* Mark the buffers dirty */ + for (i = 0; i < dir->nr_buffers; i++) + mark_buffer_dirty(dir->bhs[i]); +} + +static int adfs_dir_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bhs[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj) { @@ -51,87 +235,90 @@ void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj) } } -static int -adfs_readdir(struct file *file, struct dir_context *ctx) +static int adfs_iterate(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; - struct object_info obj; struct adfs_dir dir; - int ret = 0; - - if (ctx->pos >> 32) - return 0; + int ret; - ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + down_read(&adfs_dir_rwsem); + ret = adfs_dir_read_inode(sb, inode, &dir); if (ret) - return ret; + goto unlock; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) - goto free_out; + goto unlock_relse; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) - goto free_out; + goto unlock_relse; ctx->pos = 2; } - read_lock(&adfs_dir_lock); + ret = ops->iterate(&dir, ctx); - ret = ops->setpos(&dir, ctx->pos - 2); - if (ret) - goto unlock_out; - while (ops->getnext(&dir, &obj) == 0) { - if (!dir_emit(ctx, obj.name, obj.name_len, - obj.indaddr, DT_UNKNOWN)) - break; - ctx->pos++; - } - -unlock_out: - read_unlock(&adfs_dir_lock); +unlock_relse: + up_read(&adfs_dir_rwsem); + adfs_dir_relse(&dir); + return ret; -free_out: - ops->free(&dir); +unlock: + up_read(&adfs_dir_rwsem); return ret; } int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) { - int ret = -EINVAL; -#ifdef CONFIG_ADFS_FS_RW const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; + int ret; - printk(KERN_INFO "adfs_dir_update: object %06x in dir %06x\n", - obj->indaddr, obj->parent_id); + if (!IS_ENABLED(CONFIG_ADFS_FS_RW)) + return -EINVAL; - if (!ops->update) { - ret = -EINVAL; - goto out; - } + if (!ops->update) + return -EINVAL; - ret = ops->read(sb, obj->parent_id, 0, &dir); + down_write(&adfs_dir_rwsem); + ret = adfs_dir_read(sb, obj->parent_id, 0, &dir); if (ret) - goto out; + goto unlock; - write_lock(&adfs_dir_lock); ret = ops->update(&dir, obj); - write_unlock(&adfs_dir_lock); + if (ret) + goto forget; - if (wait) { - int err = ops->sync(&dir); - if (!ret) - ret = err; - } + ret = ops->commit(&dir); + if (ret) + goto forget; + up_write(&adfs_dir_rwsem); + + adfs_dir_mark_dirty(&dir); + + if (wait) + ret = adfs_dir_sync(&dir); + + adfs_dir_relse(&dir); + return ret; + + /* + * If the updated failed because the entry wasn't found, we can + * just release the buffers. If it was any other error, forget + * the dirtied buffers so they aren't written back to the media. 
+ */ +forget: + if (ret == -ENOENT) + adfs_dir_relse(&dir); + else + adfs_dir_forget(&dir); +unlock: + up_write(&adfs_dir_rwsem); - ops->free(&dir); -out: -#endif return ret; } @@ -167,25 +354,14 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr, u32 name_len; int ret; - ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + down_read(&adfs_dir_rwsem); + ret = adfs_dir_read_inode(sb, inode, &dir); if (ret) - goto out; - - if (ADFS_I(inode)->parent_id != dir.parent_id) { - adfs_error(sb, - "parent directory changed under me! (%06x but got %06x)\n", - ADFS_I(inode)->parent_id, dir.parent_id); - ret = -EIO; - goto free_out; - } - - obj->parent_id = inode->i_ino; - - read_lock(&adfs_dir_lock); + goto unlock; ret = ops->setpos(&dir, 0); if (ret) - goto unlock_out; + goto unlock_relse; ret = -ENOENT; name = qstr->name; @@ -196,20 +372,22 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr, break; } } + obj->parent_id = ADFS_I(inode)->indaddr; -unlock_out: - read_unlock(&adfs_dir_lock); +unlock_relse: + up_read(&adfs_dir_rwsem); + adfs_dir_relse(&dir); + return ret; -free_out: - ops->free(&dir); -out: +unlock: + up_read(&adfs_dir_rwsem); return ret; } const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .iterate = adfs_readdir, + .iterate_shared = adfs_iterate, .fsync = generic_file_fsync, }; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index c1a950c7400a..30d526fecc3f 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -9,8 +9,6 @@ #include "adfs.h" #include "dir_f.h" -static void adfs_f_free(struct adfs_dir *dir); - /* * Read an (unaligned) value of length 1..4 bytes */ @@ -60,7 +58,7 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) #define bufoff(_bh,_idx) \ ({ int _buf = _idx >> blocksize_bits; \ int _off = _idx - (_buf << blocksize_bits);\ - (u8 *)(_bh[_buf]->b_data + _off); \ + (void *)(_bh[_buf]->b_data + _off); \ }) /* @@ -123,65 +121,49 @@ adfs_dir_checkbyte(const struct adfs_dir *dir) return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff; } -/* Read and check that a directory is valid */ -static int adfs_dir_read(struct super_block *sb, u32 indaddr, - unsigned int size, struct adfs_dir *dir) +static int adfs_f_validate(struct adfs_dir *dir) { - const unsigned int blocksize_bits = sb->s_blocksize_bits; - int blk = 0; - - /* - * Directories which are not a multiple of 2048 bytes - * are considered bad v2 [3.6] - */ - if (size & 2047) - goto bad_dir; - - size >>= blocksize_bits; - - dir->nr_buffers = 0; - dir->sb = sb; - - for (blk = 0; blk < size; blk++) { - int phys; + struct adfs_dirheader *head = dir->dirhead; + struct adfs_newdirtail *tail = dir->newtail; + + if (head->startmasseq != tail->endmasseq || + tail->dirlastmask || tail->reserved[0] || tail->reserved[1] || + (memcmp(&head->startname, "Nick", 4) && + memcmp(&head->startname, "Hugo", 4)) || + memcmp(&head->startname, &tail->endname, 4) || + adfs_dir_checkbyte(dir) != tail->dircheckbyte) + return -EIO; - phys = __adfs_block_map(sb, indaddr, blk); - if (!phys) { - adfs_error(sb, "dir %06x has a hole at offset %d", - indaddr, blk); - goto release_buffers; - } + return 0; +} - dir->bh[blk] = sb_bread(sb, phys); - if (!dir->bh[blk]) - goto release_buffers; - } +/* Read and check that a directory is valid */ +static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size, + struct adfs_dir *dir) +{ + const unsigned int blocksize_bits 
= sb->s_blocksize_bits; + int ret; - memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); - memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + if (size && size != ADFS_NEWDIR_SIZE) + return -EIO; - if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || - memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) - goto bad_dir; + ret = adfs_dir_read_buffers(sb, indaddr, ADFS_NEWDIR_SIZE, dir); + if (ret) + return ret; - if (memcmp(&dir->dirhead.startname, "Nick", 4) && - memcmp(&dir->dirhead.startname, "Hugo", 4)) - goto bad_dir; + dir->dirhead = bufoff(dir->bh, 0); + dir->newtail = bufoff(dir->bh, 2007); - if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + if (adfs_f_validate(dir)) goto bad_dir; - dir->nr_buffers = blk; + dir->parent_id = adfs_readval(dir->newtail->dirparent, 3); return 0; bad_dir: adfs_error(sb, "dir %06x is corrupted", indaddr); -release_buffers: - for (blk -= 1; blk >= 0; blk -= 1) - brelse(dir->bh[blk]); - - dir->sb = NULL; + adfs_dir_relse(dir); return -EIO; } @@ -232,24 +214,12 @@ adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj) static int __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) { - struct super_block *sb = dir->sb; struct adfs_direntry de; - int thissize, buffer, offset; - - buffer = pos >> sb->s_blocksize_bits; - - if (buffer > dir->nr_buffers) - return -EINVAL; - - offset = pos & (sb->s_blocksize - 1); - thissize = sb->s_blocksize - offset; - if (thissize > 26) - thissize = 26; + int ret; - memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); - if (thissize != 26) - memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, - 26 - thissize); + ret = adfs_dir_copyfrom(&de, dir, pos, 26); + if (ret) + return ret; if (!de.dirobname[0]) return -ENOENT; @@ -260,89 +230,6 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) } static int -__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj) -{ - struct super_block *sb = dir->sb; - struct adfs_direntry de; - int thissize, buffer, offset; - - buffer = pos >> sb->s_blocksize_bits; - - if (buffer > dir->nr_buffers) - return -EINVAL; - - offset = pos & (sb->s_blocksize - 1); - thissize = sb->s_blocksize - offset; - if (thissize > 26) - thissize = 26; - - /* - * Get the entry in total - */ - memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); - if (thissize != 26) - memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, - 26 - thissize); - - /* - * update it - */ - adfs_obj2dir(&de, obj); - - /* - * Put the new entry back - */ - memcpy(dir->bh[buffer]->b_data + offset, &de, thissize); - if (thissize != 26) - memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize, - 26 - thissize); - - return 0; -} - -/* - * the caller is responsible for holding the necessary - * locks. 
- */ -static int adfs_dir_find_entry(struct adfs_dir *dir, u32 indaddr) -{ - int pos, ret; - - ret = -ENOENT; - - for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) { - struct object_info obj; - - if (!__adfs_dir_get(dir, pos, &obj)) - break; - - if (obj.indaddr == indaddr) { - ret = pos; - break; - } - } - - return ret; -} - -static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size, - struct adfs_dir *dir) -{ - int ret; - - if (size != ADFS_NEWDIR_SIZE) - return -EIO; - - ret = adfs_dir_read(sb, indaddr, size, dir); - if (ret) - adfs_error(sb, "unable to read directory"); - else - dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3); - - return ret; -} - -static int adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos) { if (fpos >= ADFS_NUM_DIR_ENTRIES) @@ -364,99 +251,74 @@ adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj) return ret; } -static int -adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +static int adfs_f_iterate(struct adfs_dir *dir, struct dir_context *ctx) { - struct super_block *sb = dir->sb; - int ret, i; + struct object_info obj; + int pos = 5 + (ctx->pos - 2) * 26; - ret = adfs_dir_find_entry(dir, obj->indaddr); - if (ret < 0) { - adfs_error(dir->sb, "unable to locate entry to update"); - goto out; + while (ctx->pos < 2 + ADFS_NUM_DIR_ENTRIES) { + if (__adfs_dir_get(dir, pos, &obj)) + break; + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.indaddr, DT_UNKNOWN)) + break; + pos += 26; + ctx->pos++; } + return 0; +} - __adfs_dir_put(dir, ret, obj); - - /* - * Increment directory sequence number - */ - dir->bh[0]->b_data[0] += 1; - dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1; - - ret = adfs_dir_checkbyte(dir); - /* - * Update directory check byte - */ - dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret; - -#if 1 - { - const unsigned int blocksize_bits = sb->s_blocksize_bits; - - memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); - memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); +static int adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +{ + struct adfs_direntry de; + int offset, ret; - if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || - memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) - goto bad_dir; + offset = 5 - (int)sizeof(de); - if (memcmp(&dir->dirhead.startname, "Nick", 4) && - memcmp(&dir->dirhead.startname, "Hugo", 4)) - goto bad_dir; + do { + offset += sizeof(de); + ret = adfs_dir_copyfrom(&de, dir, offset, sizeof(de)); + if (ret) { + adfs_error(dir->sb, "error reading directory entry"); + return -ENOENT; + } + if (!de.dirobname[0]) { + adfs_error(dir->sb, "unable to locate entry to update"); + return -ENOENT; + } + } while (adfs_readval(de.dirinddiscadd, 3) != obj->indaddr); - if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) - goto bad_dir; - } -#endif - for (i = dir->nr_buffers - 1; i >= 0; i--) - mark_buffer_dirty(dir->bh[i]); + /* Update the directory entry with the new object state */ + adfs_obj2dir(&de, obj); - ret = 0; -out: - return ret; -#if 1 -bad_dir: - adfs_error(dir->sb, "whoops! 
I broke a directory!"); - return -EIO; -#endif + /* Write the directory entry back to the directory */ + return adfs_dir_copyto(dir, offset, &de, 26); } -static int -adfs_f_sync(struct adfs_dir *dir) +static int adfs_f_commit(struct adfs_dir *dir) { - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } + int ret; - return err; -} + /* Increment directory sequence number */ + dir->dirhead->startmasseq += 1; + dir->newtail->endmasseq += 1; -static void -adfs_f_free(struct adfs_dir *dir) -{ - int i; + /* Update directory check byte */ + dir->newtail->dircheckbyte = adfs_dir_checkbyte(dir); - for (i = dir->nr_buffers - 1; i >= 0; i--) { - brelse(dir->bh[i]); - dir->bh[i] = NULL; - } + /* Make sure the directory still validates correctly */ + ret = adfs_f_validate(dir); + if (ret) + adfs_msg(dir->sb, KERN_ERR, "error: update broke directory"); - dir->nr_buffers = 0; - dir->sb = NULL; + return ret; } const struct adfs_dir_ops adfs_f_dir_ops = { .read = adfs_f_read, + .iterate = adfs_f_iterate, .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, .update = adfs_f_update, - .sync = adfs_f_sync, - .free = adfs_f_free + .commit = adfs_f_commit, }; diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h index 5aec332b90f5..a5393e6cf9f4 100644 --- a/fs/adfs/dir_f.h +++ b/fs/adfs/dir_f.h @@ -13,9 +13,9 @@ * Directory header */ struct adfs_dirheader { - unsigned char startmasseq; - unsigned char startname[4]; -}; + __u8 startmasseq; + __u8 startname[4]; +} __attribute__((packed)); #define ADFS_NEWDIR_SIZE 2048 #define ADFS_NUM_DIR_ENTRIES 77 @@ -31,32 +31,36 @@ struct adfs_direntry { __u8 dirlen[4]; __u8 dirinddiscadd[3]; __u8 newdiratts; -}; +} __attribute__((packed)); /* * Directory tail */ +struct adfs_olddirtail { + __u8 dirlastmask; + char dirname[10]; + __u8 dirparent[3]; + char dirtitle[19]; + __u8 reserved[14]; + __u8 endmasseq; + __u8 endname[4]; + __u8 dircheckbyte; +} __attribute__((packed)); + +struct adfs_newdirtail { + __u8 dirlastmask; + __u8 reserved[2]; + __u8 dirparent[3]; + char dirtitle[19]; + char dirname[10]; + __u8 endmasseq; + __u8 endname[4]; + __u8 dircheckbyte; +} __attribute__((packed)); + union adfs_dirtail { - struct { - unsigned char dirlastmask; - char dirname[10]; - unsigned char dirparent[3]; - char dirtitle[19]; - unsigned char reserved[14]; - unsigned char endmasseq; - unsigned char endname[4]; - unsigned char dircheckbyte; - } old; - struct { - unsigned char dirlastmask; - unsigned char reserved[2]; - unsigned char dirparent[3]; - char dirtitle[19]; - char dirname[10]; - unsigned char endmasseq; - unsigned char endname[4]; - unsigned char dircheckbyte; - } new; + struct adfs_olddirtail old; + struct adfs_newdirtail new; }; #endif diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index d56924c11b17..4a15924014da 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -4,123 +4,163 @@ * * Copyright (C) 1997-1999 Russell King */ -#include <linux/slab.h> #include "adfs.h" #include "dir_fplus.h" -static int -adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +/* Return the byte offset to directory entry pos */ +static unsigned int adfs_fplus_offset(const struct adfs_bigdirheader *h, + unsigned int pos) { - struct adfs_bigdirheader *h; - struct adfs_bigdirtail *t; - unsigned long block; - unsigned int blk, size; - int i, ret = -EIO; + return offsetof(struct adfs_bigdirheader, 
bigdirname) + + ALIGN(le32_to_cpu(h->bigdirnamelen), 4) + + pos * sizeof(struct adfs_bigdirentry); +} - dir->nr_buffers = 0; +static int adfs_fplus_validate_header(const struct adfs_bigdirheader *h) +{ + unsigned int size = le32_to_cpu(h->bigdirsize); + unsigned int len; - /* start off using fixed bh set - only alloc for big dirs */ - dir->bh_fplus = &dir->bh[0]; + if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || + h->bigdirversion[2] != 0 || + h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME) || + !size || size & 2047 || size > SZ_4M) + return -EIO; - block = __adfs_block_map(sb, id, 0); - if (!block) { - adfs_error(sb, "dir object %X has a hole at offset 0", id); - goto out; - } + size -= sizeof(struct adfs_bigdirtail) + + offsetof(struct adfs_bigdirheader, bigdirname); - dir->bh_fplus[0] = sb_bread(sb, block); - if (!dir->bh_fplus[0]) - goto out; - dir->nr_buffers += 1; + /* Check that bigdirnamelen fits within the directory */ + len = ALIGN(le32_to_cpu(h->bigdirnamelen), 4); + if (len > size) + return -EIO; - h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data; - size = le32_to_cpu(h->bigdirsize); - if (size != sz) { - adfs_msg(sb, KERN_WARNING, - "directory header size %X does not match directory size %X", - size, sz); + size -= len; + + /* Check that bigdirnamesize fits within the directory */ + len = le32_to_cpu(h->bigdirnamesize); + if (len > size) + return -EIO; + + size -= len; + + /* + * Avoid division, we know that absolute maximum number of entries + * can not be so large to cause overflow of the multiplication below. + */ + len = le32_to_cpu(h->bigdirentries); + if (len > SZ_4M / sizeof(struct adfs_bigdirentry) || + len * sizeof(struct adfs_bigdirentry) > size) + return -EIO; + + return 0; +} + +static int adfs_fplus_validate_tail(const struct adfs_bigdirheader *h, + const struct adfs_bigdirtail *t) +{ + if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || + t->bigdirendmasseq != h->startmasseq || + t->reserved[0] != 0 || t->reserved[1] != 0) + return -EIO; + + return 0; +} + +static u8 adfs_fplus_checkbyte(struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h = dir->bighead; + struct adfs_bigdirtail *t = dir->bigtail; + unsigned int end, bs, bi, i; + __le32 *bp; + u32 dircheck; + + end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)) + + le32_to_cpu(h->bigdirnamesize); + + /* Accumulate the contents of the header, entries and names */ + for (dircheck = 0, bi = 0; end; bi++) { + bp = (void *)dir->bhs[bi]->b_data; + bs = dir->bhs[bi]->b_size; + if (bs > end) + bs = end; + + for (i = 0; i < bs; i += sizeof(u32)) + dircheck = ror32(dircheck, 13) ^ le32_to_cpup(bp++); + + end -= bs; } - if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || - h->bigdirversion[2] != 0 || size & 2047 || - h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) { - adfs_error(sb, "dir %06x has malformed header", id); + /* Accumulate the contents of the tail except for the check byte */ + dircheck = ror32(dircheck, 13) ^ le32_to_cpu(t->bigdirendname); + dircheck = ror32(dircheck, 13) ^ t->bigdirendmasseq; + dircheck = ror32(dircheck, 13) ^ t->reserved[0]; + dircheck = ror32(dircheck, 13) ^ t->reserved[1]; + + return dircheck ^ dircheck >> 8 ^ dircheck >> 16 ^ dircheck >> 24; +} + +static int adfs_fplus_read(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h; + struct adfs_bigdirtail *t; + unsigned int dirsize; + int ret; + + /* Read first buffer */ + ret = adfs_dir_read_buffers(sb, indaddr, sb->s_blocksize, dir); + 
if (ret) + return ret; + + dir->bighead = h = (void *)dir->bhs[0]->b_data; + ret = adfs_fplus_validate_header(h); + if (ret) { + adfs_error(sb, "dir %06x has malformed header", indaddr); goto out; } - size >>= sb->s_blocksize_bits; - if (size > ARRAY_SIZE(dir->bh)) { - /* this directory is too big for fixed bh set, must allocate */ - struct buffer_head **bh_fplus = - kcalloc(size, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!bh_fplus) { - adfs_msg(sb, KERN_ERR, - "not enough memory for dir object %X (%d blocks)", - id, size); - ret = -ENOMEM; - goto out; - } - dir->bh_fplus = bh_fplus; - /* copy over the pointer to the block that we've already read */ - dir->bh_fplus[0] = dir->bh[0]; + dirsize = le32_to_cpu(h->bigdirsize); + if (size && dirsize != size) { + adfs_msg(sb, KERN_WARNING, + "dir %06x header size %X does not match directory size %X", + indaddr, dirsize, size); } - for (blk = 1; blk < size; blk++) { - block = __adfs_block_map(sb, id, blk); - if (!block) { - adfs_error(sb, "dir object %X has a hole at offset %d", id, blk); - goto out; - } + /* Read remaining buffers */ + ret = adfs_dir_read_buffers(sb, indaddr, dirsize, dir); + if (ret) + return ret; - dir->bh_fplus[blk] = sb_bread(sb, block); - if (!dir->bh_fplus[blk]) { - adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", - id, blk, block); - goto out; - } + dir->bigtail = t = (struct adfs_bigdirtail *) + (dir->bhs[dir->nr_buffers - 1]->b_data + (sb->s_blocksize - 8)); - dir->nr_buffers += 1; + ret = adfs_fplus_validate_tail(h, t); + if (ret) { + adfs_error(sb, "dir %06x has malformed tail", indaddr); + goto out; } - t = (struct adfs_bigdirtail *) - (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8)); - - if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || - t->bigdirendmasseq != h->startmasseq || - t->reserved[0] != 0 || t->reserved[1] != 0) { - adfs_error(sb, "dir %06x has malformed tail", id); + if (adfs_fplus_checkbyte(dir) != t->bigdircheckbyte) { + adfs_error(sb, "dir %06x checkbyte mismatch\n", indaddr); goto out; } dir->parent_id = le32_to_cpu(h->bigdirparent); - dir->sb = sb; return 0; out: - if (dir->bh_fplus) { - for (i = 0; i < dir->nr_buffers; i++) - brelse(dir->bh_fplus[i]); - - if (&dir->bh[0] != dir->bh_fplus) - kfree(dir->bh_fplus); + adfs_dir_relse(dir); - dir->bh_fplus = NULL; - } - - dir->nr_buffers = 0; - dir->sb = NULL; return ret; } static int adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) { - struct adfs_bigdirheader *h = - (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; int ret = -ENOENT; - if (fpos <= le32_to_cpu(h->bigdirentries)) { + if (fpos <= le32_to_cpu(dir->bighead->bigdirentries)) { dir->pos = fpos; ret = 0; } @@ -128,51 +168,23 @@ adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) return ret; } -static void -dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len) -{ - struct super_block *sb = dir->sb; - unsigned int buffer, partial, remainder; - - buffer = offset >> sb->s_blocksize_bits; - offset &= sb->s_blocksize - 1; - - partial = sb->s_blocksize - offset; - - if (partial >= len) - memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len); - else { - char *c = (char *)to; - - remainder = len - partial; - - memcpy(c, - dir->bh_fplus[buffer]->b_data + offset, - partial); - - memcpy(c + partial, - dir->bh_fplus[buffer + 1]->b_data, - remainder); - } -} - static int adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) { - struct adfs_bigdirheader *h = - (struct adfs_bigdirheader *) 
dir->bh_fplus[0]->b_data; + struct adfs_bigdirheader *h = dir->bighead; struct adfs_bigdirentry bde; unsigned int offset; - int ret = -ENOENT; + int ret; if (dir->pos >= le32_to_cpu(h->bigdirentries)) - goto out; + return -ENOENT; - offset = offsetof(struct adfs_bigdirheader, bigdirname); - offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); - offset += dir->pos * sizeof(struct adfs_bigdirentry); + offset = adfs_fplus_offset(h, dir->pos); - dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry)); + ret = adfs_dir_copyfrom(&bde, dir, offset, + sizeof(struct adfs_bigdirentry)); + if (ret) + return ret; obj->loadaddr = le32_to_cpu(bde.bigdirload); obj->execaddr = le32_to_cpu(bde.bigdirexec); @@ -181,59 +193,95 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) obj->attr = le32_to_cpu(bde.bigdirattr); obj->name_len = le32_to_cpu(bde.bigdirobnamelen); - offset = offsetof(struct adfs_bigdirheader, bigdirname); - offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); - offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry); + offset = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)); offset += le32_to_cpu(bde.bigdirobnameptr); - dir_memcpy(dir, offset, obj->name, obj->name_len); + ret = adfs_dir_copyfrom(obj->name, dir, offset, obj->name_len); + if (ret) + return ret; + adfs_object_fixup(dir, obj); dir->pos += 1; - ret = 0; -out: - return ret; + + return 0; } -static int -adfs_fplus_sync(struct adfs_dir *dir) +static int adfs_fplus_iterate(struct adfs_dir *dir, struct dir_context *ctx) { - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh_fplus[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; + struct object_info obj; + + if ((ctx->pos - 2) >> 32) + return 0; + + if (adfs_fplus_setpos(dir, ctx->pos - 2)) + return 0; + + while (!adfs_fplus_getnext(dir, &obj)) { + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.indaddr, DT_UNKNOWN)) + break; + ctx->pos++; } - return err; + return 0; } -static void -adfs_fplus_free(struct adfs_dir *dir) +static int adfs_fplus_update(struct adfs_dir *dir, struct object_info *obj) { - int i; + struct adfs_bigdirheader *h = dir->bighead; + struct adfs_bigdirentry bde; + int offset, end, ret; - if (dir->bh_fplus) { - for (i = 0; i < dir->nr_buffers; i++) - brelse(dir->bh_fplus[i]); + offset = adfs_fplus_offset(h, 0) - sizeof(bde); + end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)); - if (&dir->bh[0] != dir->bh_fplus) - kfree(dir->bh_fplus); + do { + offset += sizeof(bde); + if (offset >= end) { + adfs_error(dir->sb, "unable to locate entry to update"); + return -ENOENT; + } + ret = adfs_dir_copyfrom(&bde, dir, offset, sizeof(bde)); + if (ret) { + adfs_error(dir->sb, "error reading directory entry"); + return -ENOENT; + } + } while (le32_to_cpu(bde.bigdirindaddr) != obj->indaddr); - dir->bh_fplus = NULL; - } + bde.bigdirload = cpu_to_le32(obj->loadaddr); + bde.bigdirexec = cpu_to_le32(obj->execaddr); + bde.bigdirlen = cpu_to_le32(obj->size); + bde.bigdirindaddr = cpu_to_le32(obj->indaddr); + bde.bigdirattr = cpu_to_le32(obj->attr); + + return adfs_dir_copyto(dir, offset, &bde, sizeof(bde)); +} + +static int adfs_fplus_commit(struct adfs_dir *dir) +{ + int ret; - dir->nr_buffers = 0; - dir->sb = NULL; + /* Increment directory sequence number */ + dir->bighead->startmasseq += 1; + dir->bigtail->bigdirendmasseq += 1; + + /* Update directory check byte */ + dir->bigtail->bigdircheckbyte = adfs_fplus_checkbyte(dir); + + /* 
Make sure the directory still validates correctly */ + ret = adfs_fplus_validate_header(dir->bighead); + if (ret == 0) + ret = adfs_fplus_validate_tail(dir->bighead, dir->bigtail); + + return ret; } const struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, + .iterate = adfs_fplus_iterate, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, - .sync = adfs_fplus_sync, - .free = adfs_fplus_free + .update = adfs_fplus_update, + .commit = adfs_fplus_commit, }; diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h index 4ec0931e36ad..d729b1591e5e 100644 --- a/fs/adfs/dir_fplus.h +++ b/fs/adfs/dir_fplus.h @@ -22,7 +22,7 @@ struct adfs_bigdirheader { __le32 bigdirnamesize; __le32 bigdirparent; char bigdirname[1]; -}; +} __attribute__((packed, aligned(4))); struct adfs_bigdirentry { __le32 bigdirload; @@ -32,11 +32,11 @@ struct adfs_bigdirentry { __le32 bigdirattr; __le32 bigdirobnamelen; __le32 bigdirobnameptr; -}; +} __attribute__((packed, aligned(4))); struct adfs_bigdirtail { __le32 bigdirendname; __u8 bigdirendmasseq; __u8 reserved[2]; __u8 bigdircheckbyte; -}; +} __attribute__((packed, aligned(4))); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 124de75413a5..32620f4a7623 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -20,7 +20,8 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, if (block >= inode->i_blocks) goto abort_toobig; - block = __adfs_block_map(inode->i_sb, inode->i_ino, block); + block = __adfs_block_map(inode->i_sb, ADFS_I(inode)->indaddr, + block); if (block) map_bh(bh, inode->i_sb, block); return 0; @@ -126,29 +127,29 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode) * Convert Linux permission to ADFS attribute. We try to do the reverse * of atts2mode, but there is not a 1:1 translation. */ -static int -adfs_mode2atts(struct super_block *sb, struct inode *inode) +static int adfs_mode2atts(struct super_block *sb, struct inode *inode, + umode_t ia_mode) { + struct adfs_sb_info *asb = ADFS_SB(sb); umode_t mode; int attr; - struct adfs_sb_info *asb = ADFS_SB(sb); /* FIXME: should we be able to alter a link? */ if (S_ISLNK(inode->i_mode)) return ADFS_I(inode)->attr; + /* Directories do not have read/write permissions on the media */ if (S_ISDIR(inode->i_mode)) - attr = ADFS_NDA_DIRECTORY; - else - attr = 0; + return ADFS_NDA_DIRECTORY; - mode = inode->i_mode & asb->s_owner_mask; + attr = 0; + mode = ia_mode & asb->s_owner_mask; if (mode & S_IRUGO) attr |= ADFS_NDA_OWNER_READ; if (mode & S_IWUGO) attr |= ADFS_NDA_OWNER_WRITE; - mode = inode->i_mode & asb->s_other_mask; + mode = ia_mode & asb->s_other_mask; mode &= ~asb->s_owner_mask; if (mode & S_IRUGO) attr |= ADFS_NDA_PUBLIC_READ; @@ -158,6 +159,8 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode) return attr; } +static const s64 nsec_unix_epoch_diff_risc_os_epoch = 2208988800000000000LL; + /* * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds @@ -170,8 +173,6 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since * 01 Jan 1900 00:00:00 (RISC OS epoch) */ - static const s64 nsec_unix_epoch_diff_risc_os_epoch = - 2208988800000000000LL; s64 nsec; if (!adfs_inode_is_stamped(inode)) @@ -204,24 +205,23 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) return; } -/* - * Convert an Unix time to ADFS time. 
We only do this if the entry has a - * time/date stamp already. - */ -static void -adfs_unix2adfs_time(struct inode *inode, unsigned int secs) +/* Convert an Unix time to ADFS time for an entry that is already stamped. */ +static void adfs_unix2adfs_time(struct inode *inode, + const struct timespec64 *ts) { - unsigned int high, low; + s64 cs, nsec = timespec64_to_ns(ts); - if (adfs_inode_is_stamped(inode)) { - /* convert 32-bit seconds to 40-bit centi-seconds */ - low = (secs & 255) * 100; - high = (secs / 256) * 100 + (low >> 8) + 0x336e996a; + /* convert from Unix to RISC OS epoch */ + nsec += nsec_unix_epoch_diff_risc_os_epoch; - ADFS_I(inode)->loadaddr = (high >> 24) | - (ADFS_I(inode)->loadaddr & ~0xff); - ADFS_I(inode)->execaddr = (low & 255) | (high << 8); - } + /* convert from nanoseconds to centiseconds */ + cs = div_s64(nsec, 10000000); + + cs = clamp_t(s64, cs, 0, 0xffffffffff); + + ADFS_I(inode)->loadaddr &= ~0xff; + ADFS_I(inode)->loadaddr |= (cs >> 32) & 0xff; + ADFS_I(inode)->execaddr = cs; } /* @@ -260,6 +260,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) * for cross-directory renames. */ ADFS_I(inode)->parent_id = obj->parent_id; + ADFS_I(inode)->indaddr = obj->indaddr; ADFS_I(inode)->loadaddr = obj->loadaddr; ADFS_I(inode)->execaddr = obj->execaddr; ADFS_I(inode)->attr = obj->attr; @@ -315,10 +316,11 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) truncate_setsize(inode, attr->ia_size); - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = attr->ia_mtime; - adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec); + if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) { + adfs_unix2adfs_time(inode, &attr->ia_mtime); + adfs_adfs2unix_time(&inode->i_mtime, inode); } + /* * FIXME: should we make these == to i_mtime since we don't * have the ability to represent them in our filesystem? @@ -328,7 +330,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_CTIME) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { - ADFS_I(inode)->attr = adfs_mode2atts(sb, inode); + ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode); inode->i_mode = adfs_atts2mode(sb, inode); } @@ -353,7 +355,7 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct object_info obj; int ret; - obj.indaddr = inode->i_ino; + obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; obj.parent_id = ADFS_I(inode)->parent_id; obj.loadaddr = ADFS_I(inode)->loadaddr; diff --git a/fs/adfs/map.c b/fs/adfs/map.c index f44d12cef5be..a81de80c45c1 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -4,6 +4,8 @@ * * Copyright (C) 1997-2002 Russell King */ +#include <linux/slab.h> +#include <linux/statfs.h> #include <asm/unaligned.h> #include "adfs.h" @@ -66,54 +68,41 @@ static DEFINE_RWLOCK(adfs_map_lock); static int lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen, const u32 frag_id, unsigned int *offset) { - const unsigned int mapsize = dm->dm_endbit; + const unsigned int endbit = dm->dm_endbit; const u32 idmask = (1 << idlen) - 1; - unsigned char *map = dm->dm_bh->b_data + 4; + unsigned char *map = dm->dm_bh->b_data; unsigned int start = dm->dm_startbit; - unsigned int mapptr; + unsigned int freelink, fragend; u32 frag; + frag = GET_FRAG_ID(map, 8, idmask & 0x7fff); + freelink = frag ? 
8 + frag : 0; + do { frag = GET_FRAG_ID(map, start, idmask); - mapptr = start + idlen; - - /* - * find end of fragment - */ - { - __le32 *_map = (__le32 *)map; - u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); - while (v == 0) { - mapptr = (mapptr & ~31) + 32; - if (mapptr >= mapsize) - goto error; - v = le32_to_cpu(_map[mapptr >> 5]); - } - - mapptr += 1 + ffz(~v); + + fragend = find_next_bit_le(map, endbit, start + idlen); + if (fragend >= endbit) + goto error; + + if (start == freelink) { + freelink += frag & 0x7fff; + } else if (frag == frag_id) { + unsigned int length = fragend + 1 - start; + + if (*offset < length) + return start + *offset; + *offset -= length; } - if (frag == frag_id) - goto found; -again: - start = mapptr; - } while (mapptr < mapsize); + start = fragend + 1; + } while (start < endbit); return -1; error: printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n", - frag, start, mapptr); + frag, start, fragend); return -1; - -found: - { - int length = mapptr - start; - if (*offset >= length) { - *offset -= length; - goto again; - } - } - return start + *offset; } /* @@ -125,12 +114,12 @@ found: static unsigned int scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) { - const unsigned int mapsize = dm->dm_endbit + 32; + const unsigned int endbit = dm->dm_endbit; const unsigned int idlen = asb->s_idlen; const unsigned int frag_idlen = idlen <= 15 ? idlen : 15; const u32 idmask = (1 << frag_idlen) - 1; unsigned char *map = dm->dm_bh->b_data; - unsigned int start = 8, mapptr; + unsigned int start = 8, fragend; u32 frag; unsigned long total = 0; @@ -149,29 +138,13 @@ scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) do { start += frag; - /* - * get fragment id - */ frag = GET_FRAG_ID(map, start, idmask); - mapptr = start + idlen; - - /* - * find end of fragment - */ - { - __le32 *_map = (__le32 *)map; - u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); - while (v == 0) { - mapptr = (mapptr & ~31) + 32; - if (mapptr >= mapsize) - goto error; - v = le32_to_cpu(_map[mapptr >> 5]); - } - - mapptr += 1 + ffz(~v); - } - total += mapptr - start; + fragend = find_next_bit_le(map, endbit, start + idlen); + if (fragend >= endbit) + goto error; + + total += fragend + 1 - start; } while (frag >= idlen + 1); if (frag != 0) @@ -220,10 +193,10 @@ found: * total_free = E(free_in_zone_n) * nzones */ -unsigned int -adfs_map_free(struct super_block *sb) +void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf) { struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discrecord *dr = adfs_map_discrecord(asb->s_map); struct adfs_discmap *dm; unsigned int total = 0; unsigned int zone; @@ -235,7 +208,10 @@ adfs_map_free(struct super_block *sb) total += scan_free_map(asb, dm++); } while (--zone > 0); - return signed_asl(total, asb->s_map2blk); + buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits; + buf->f_files = asb->s_ids_per_zone * asb->s_map_size; + buf->f_bavail = + buf->f_bfree = signed_asl(total, asb->s_map2blk); } int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset) @@ -280,3 +256,152 @@ bad_fragment: frag_id, zone, asb->s_map_size); return 0; } + +static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) +{ + unsigned int v0, v1, v2, v3; + int i; + + v0 = v1 = v2 = v3 = 0; + for (i = sb->s_blocksize - 4; i; i -= 4) { + v0 += map[i] + (v3 >> 8); + v3 &= 0xff; + v1 += map[i + 1] + (v0 >> 8); + v0 &= 0xff; + v2 += map[i + 2] + (v1 >> 8); + v1 &= 0xff; + v3 += map[i + 3] + (v2 
>> 8); + v2 &= 0xff; + } + v0 += v3 >> 8; + v1 += map[1] + (v0 >> 8); + v2 += map[2] + (v1 >> 8); + v3 += map[3] + (v2 >> 8); + + return v0 ^ v1 ^ v2 ^ v3; +} + +static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) +{ + unsigned char crosscheck = 0, zonecheck = 1; + int i; + + for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { + unsigned char *map; + + map = dm[i].dm_bh->b_data; + + if (adfs_calczonecheck(sb, map) != map[0]) { + adfs_error(sb, "zone %d fails zonecheck", i); + zonecheck = 0; + } + crosscheck ^= map[3]; + } + if (crosscheck != 0xff) + adfs_error(sb, "crosscheck != 0xff"); + return crosscheck == 0xff && zonecheck; +} + +/* + * Layout the map - the first zone contains a copy of the disc record, + * and the last zone must be limited to the size of the filesystem. + */ +static void adfs_map_layout(struct adfs_discmap *dm, unsigned int nzones, + struct adfs_discrecord *dr) +{ + unsigned int zone, zone_size; + u64 size; + + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + + dm[0].dm_bh = NULL; + dm[0].dm_startblk = 0; + dm[0].dm_startbit = 32 + ADFS_DR_SIZE_BITS; + dm[0].dm_endbit = 32 + zone_size; + + for (zone = 1; zone < nzones; zone++) { + dm[zone].dm_bh = NULL; + dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; + dm[zone].dm_startbit = 32; + dm[zone].dm_endbit = 32 + zone_size; + } + + size = adfs_disc_size(dr) >> dr->log2bpmb; + size -= (nzones - 1) * zone_size - ADFS_DR_SIZE_BITS; + dm[nzones - 1].dm_endbit = 32 + size; +} + +static int adfs_map_read(struct adfs_discmap *dm, struct super_block *sb, + unsigned int map_addr, unsigned int nzones) +{ + unsigned int zone; + + for (zone = 0; zone < nzones; zone++) { + dm[zone].dm_bh = sb_bread(sb, map_addr + zone); + if (!dm[zone].dm_bh) + return -EIO; + } + + return 0; +} + +static void adfs_map_relse(struct adfs_discmap *dm, unsigned int nzones) +{ + unsigned int zone; + + for (zone = 0; zone < nzones; zone++) + brelse(dm[zone].dm_bh); +} + +struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discmap *dm; + unsigned int map_addr, zone_size, nzones; + int ret; + + nzones = dr->nzones | dr->nzones_high << 8; + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + + asb->s_idlen = dr->idlen; + asb->s_map_size = nzones; + asb->s_map2blk = dr->log2bpmb - dr->log2secsize; + asb->s_log2sharesize = dr->log2sharesize; + asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + + map_addr = (nzones >> 1) * zone_size - + ((nzones > 1) ? 
ADFS_DR_SIZE_BITS : 0); + map_addr = signed_asl(map_addr, asb->s_map2blk); + + dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); + if (dm == NULL) { + adfs_error(sb, "not enough memory"); + return ERR_PTR(-ENOMEM); + } + + adfs_map_layout(dm, nzones, dr); + + ret = adfs_map_read(dm, sb, map_addr, nzones); + if (ret) { + adfs_error(sb, "unable to read map"); + goto error_free; + } + + if (adfs_checkmap(sb, dm)) + return dm; + + adfs_error(sb, "map corrupted"); + +error_free: + adfs_map_relse(dm, nzones); + kfree(dm); + return ERR_PTR(-EIO); +} + +void adfs_free_map(struct super_block *sb) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + + adfs_map_relse(asb->s_map, asb->s_map_size); + kfree(asb->s_map); +} diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 65b04ebb51c3..a3cc8ecb50da 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -88,59 +88,11 @@ static int adfs_checkdiscrecord(struct adfs_discrecord *dr) return 0; } -static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) -{ - unsigned int v0, v1, v2, v3; - int i; - - v0 = v1 = v2 = v3 = 0; - for (i = sb->s_blocksize - 4; i; i -= 4) { - v0 += map[i] + (v3 >> 8); - v3 &= 0xff; - v1 += map[i + 1] + (v0 >> 8); - v0 &= 0xff; - v2 += map[i + 2] + (v1 >> 8); - v1 &= 0xff; - v3 += map[i + 3] + (v2 >> 8); - v2 &= 0xff; - } - v0 += v3 >> 8; - v1 += map[1] + (v0 >> 8); - v2 += map[2] + (v1 >> 8); - v3 += map[3] + (v2 >> 8); - - return v0 ^ v1 ^ v2 ^ v3; -} - -static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) -{ - unsigned char crosscheck = 0, zonecheck = 1; - int i; - - for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { - unsigned char *map; - - map = dm[i].dm_bh->b_data; - - if (adfs_calczonecheck(sb, map) != map[0]) { - adfs_error(sb, "zone %d fails zonecheck", i); - zonecheck = 0; - } - crosscheck ^= map[3]; - } - if (crosscheck != 0xff) - adfs_error(sb, "crosscheck != 0xff"); - return crosscheck == 0xff && zonecheck; -} - static void adfs_put_super(struct super_block *sb) { - int i; struct adfs_sb_info *asb = ADFS_SB(sb); - for (i = 0; i < asb->s_map_size; i++) - brelse(asb->s_map[i].dm_bh); - kfree(asb->s_map); + adfs_free_map(sb); kfree_rcu(asb, rcu); } @@ -249,16 +201,13 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct adfs_sb_info *sbi = ADFS_SB(sb); - struct adfs_discrecord *dr = adfs_map_discrecord(sbi->s_map); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + adfs_map_statfs(sb, buf); + buf->f_type = ADFS_SUPER_MAGIC; buf->f_namelen = sbi->s_namelen; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits; - buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; - buf->f_bavail = - buf->f_bfree = adfs_map_free(sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -282,6 +231,12 @@ static void adfs_free_inode(struct inode *inode) kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } +static int adfs_drop_inode(struct inode *inode) +{ + /* always drop inodes if we are read-only */ + return !IS_ENABLED(CONFIG_ADFS_FS_RW) || IS_RDONLY(inode); +} + static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; @@ -314,7 +269,7 @@ static void destroy_inodecache(void) static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, .free_inode = adfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = adfs_drop_inode, 
.write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, @@ -322,66 +277,94 @@ static const struct super_operations adfs_sops = { .show_options = adfs_show_options, }; -static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +static int adfs_probe(struct super_block *sb, unsigned int offset, int silent, + int (*validate)(struct super_block *sb, + struct buffer_head *bh, + struct adfs_discrecord **bhp)) { - struct adfs_discmap *dm; - unsigned int map_addr, zone_size, nzones; - int i, zone; struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discrecord *dr; + struct buffer_head *bh; + unsigned int blocksize = BLOCK_SIZE; + int ret, try; + + for (try = 0; try < 2; try++) { + /* try to set the requested block size */ + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + if (!silent) + adfs_msg(sb, KERN_ERR, + "error: unsupported blocksize"); + return -EINVAL; + } - nzones = asb->s_map_size; - zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); - map_addr = (nzones >> 1) * zone_size - - ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0); - map_addr = signed_asl(map_addr, asb->s_map2blk); + /* read the buffer */ + bh = sb_bread(sb, offset >> sb->s_blocksize_bits); + if (!bh) { + adfs_msg(sb, KERN_ERR, + "error: unable to read block %u, try %d", + offset >> sb->s_blocksize_bits, try); + return -EIO; + } + + /* validate it */ + ret = validate(sb, bh, &dr); + if (ret) { + brelse(bh); + return ret; + } - asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + /* does the block size match the filesystem block size? */ + blocksize = 1 << dr->log2secsize; + if (sb->s_blocksize == blocksize) { + asb->s_map = adfs_read_map(sb, dr); + brelse(bh); + return PTR_ERR_OR_ZERO(asb->s_map); + } - dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); - if (dm == NULL) { - adfs_error(sb, "not enough memory"); - return ERR_PTR(-ENOMEM); + brelse(bh); } - for (zone = 0; zone < nzones; zone++, map_addr++) { - dm[zone].dm_startbit = 0; - dm[zone].dm_endbit = zone_size; - dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; - dm[zone].dm_bh = sb_bread(sb, map_addr); + return -EIO; +} - if (!dm[zone].dm_bh) { - adfs_error(sb, "unable to read map"); - goto error_free; - } - } +static int adfs_validate_bblk(struct super_block *sb, struct buffer_head *bh, + struct adfs_discrecord **drp) +{ + struct adfs_discrecord *dr; + unsigned char *b_data; - /* adjust the limits for the first and last map zones */ - i = zone - 1; - dm[0].dm_startblk = 0; - dm[0].dm_startbit = ADFS_DR_SIZE_BITS; - dm[i].dm_endbit = (adfs_disc_size(dr) >> dr->log2bpmb) + - (ADFS_DR_SIZE_BITS - i * zone_size); + b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); + if (adfs_checkbblk(b_data)) + return -EILSEQ; - if (adfs_checkmap(sb, dm)) - return dm; + /* Do some sanity checks on the ADFS disc record */ + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + if (adfs_checkdiscrecord(dr)) + return -EILSEQ; + + *drp = dr; + return 0; +} - adfs_error(sb, "map corrupted"); +static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh, + struct adfs_discrecord **drp) +{ + struct adfs_discrecord *dr; -error_free: - while (--zone >= 0) - brelse(dm[zone].dm_bh); + /* Do some sanity checks on the ADFS disc record */ + dr = (struct adfs_discrecord *)(bh->b_data + 4); + if (adfs_checkdiscrecord(dr) || dr->nzones_high || dr->nzones != 1) + return -EILSEQ; - kfree(dm); - return ERR_PTR(-EIO); + *drp = dr; + return 0; } static int 
adfs_fill_super(struct super_block *sb, void *data, int silent) { struct adfs_discrecord *dr; - struct buffer_head *bh; struct object_info root_obj; - unsigned char *b_data; - unsigned int blocksize; struct adfs_sb_info *asb; struct inode *root; int ret = -EINVAL; @@ -391,7 +374,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) return -ENOMEM; + sb->s_fs_info = asb; + sb->s_magic = ADFS_SUPER_MAGIC; + sb->s_time_gran = 10000000; /* set default options */ asb->s_uid = GLOBAL_ROOT_UID; @@ -403,78 +389,21 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (parse_options(sb, asb, data)) goto error; - sb_set_blocksize(sb, BLOCK_SIZE); - if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { - adfs_msg(sb, KERN_ERR, "error: unable to read superblock"); - ret = -EIO; - goto error; - } - - b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); - - if (adfs_checkbblk(b_data)) { - ret = -EINVAL; - goto error_badfs; - } - - dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); - - /* - * Do some sanity checks on the ADFS disc record - */ - if (adfs_checkdiscrecord(dr)) { - ret = -EINVAL; - goto error_badfs; - } - - blocksize = 1 << dr->log2secsize; - brelse(bh); - - if (sb_set_blocksize(sb, blocksize)) { - bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); - if (!bh) { - adfs_msg(sb, KERN_ERR, - "error: couldn't read superblock on 2nd try."); - ret = -EIO; - goto error; - } - b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); - if (adfs_checkbblk(b_data)) { - adfs_msg(sb, KERN_ERR, - "error: disc record mismatch, very weird!"); - ret = -EINVAL; - goto error_free_bh; - } - dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); - } else { + /* Try to probe the filesystem boot block */ + ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk); + if (ret == -EILSEQ) + ret = adfs_probe(sb, 0, silent, adfs_validate_dr0); + if (ret == -EILSEQ) { if (!silent) adfs_msg(sb, KERN_ERR, - "error: unsupported blocksize"); + "error: can't find an ADFS filesystem on dev %s.", + sb->s_id); ret = -EINVAL; - goto error; } + if (ret) + goto error; - /* - * blocksize on this device should now be set to the ADFS log2secsize - */ - - sb->s_magic = ADFS_SUPER_MAGIC; - asb->s_idlen = dr->idlen; - asb->s_map_size = dr->nzones | (dr->nzones_high << 8); - asb->s_map2blk = dr->log2bpmb - dr->log2secsize; - asb->s_log2sharesize = dr->log2sharesize; - - asb->s_map = adfs_read_map(sb, dr); - if (IS_ERR(asb->s_map)) { - ret = PTR_ERR(asb->s_map); - goto error_free_bh; - } - - brelse(bh); - - /* - * set up enough so that we can read an inode - */ + /* set up enough so that we can read an inode */ sb->s_op = &adfs_sops; dr = adfs_map_discrecord(asb->s_map); @@ -511,23 +440,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) root = adfs_iget(sb, &root_obj); sb->s_root = d_make_root(root); if (!sb->s_root) { - int i; - for (i = 0; i < asb->s_map_size; i++) - brelse(asb->s_map[i].dm_bh); - kfree(asb->s_map); + adfs_free_map(sb); adfs_error(sb, "get root inode failed\n"); ret = -EIO; goto error; } return 0; -error_badfs: - if (!silent) - adfs_msg(sb, KERN_ERR, - "error: can't find an ADFS filesystem on dev %s.", - sb->s_id); -error_free_bh: - brelse(bh); error: sb->s_fs_info = NULL; kfree(asb); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ecd8d2698515..f4713ea76e82 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -97,7 +97,7 @@ static struct linux_binfmt 
elf_format = { .min_coredump = ELF_EXEC_PAGESIZE, }; -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) +#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) static int set_brk(unsigned long start, unsigned long end, int prot) { @@ -161,9 +161,11 @@ static int padzero(unsigned long elf_bss) #endif static int -create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr) +create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr, + unsigned long e_entry) { + struct mm_struct *mm = current->mm; unsigned long p = bprm->p; int argc = bprm->argc; int envc = bprm->envc; @@ -176,7 +178,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; - int ei_index = 0; + int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; @@ -226,12 +228,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Create the ELF interpreter info */ - elf_info = (elf_addr_t *)current->mm->saved_auxv; + elf_info = (elf_addr_t *)mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ #define NEW_AUX_ENT(id, val) \ do { \ - elf_info[ei_index++] = id; \ - elf_info[ei_index++] = val; \ + *elf_info++ = id; \ + *elf_info++ = val; \ } while (0) #ifdef ARCH_DLINFO @@ -251,7 +253,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); - NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_ENTRY, e_entry); NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); @@ -275,12 +277,13 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ - memset(&elf_info[ei_index], 0, - sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + memset(elf_info, 0, (char *)mm->saved_auxv + + sizeof(mm->saved_auxv) - (char *)elf_info); /* And advance past the AT_NULL entry. */ - ei_index += 2; + elf_info += 2; + ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; sp = STACK_ADD(p, ei_index); items = (argc + 1) + (envc + 1) + 1; @@ -299,7 +302,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - vma = find_extend_vma(current->mm, bprm->p); + vma = find_extend_vma(mm, bprm->p); if (!vma) return -EFAULT; @@ -308,7 +311,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Populate list of argv pointers back to argv strings. */ - p = current->mm->arg_end = current->mm->arg_start; + p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -320,10 +323,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->arg_end = p; + mm->arg_end = p; /* Populate list of envp pointers back to envp strings. 
*/ - current->mm->env_end = current->mm->env_start = p; + mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -335,10 +338,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->env_end = p; + mm->env_end = p; /* Put the elf_info on the stack in the right place. */ - if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t))) + if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t))) return -EFAULT; return 0; } @@ -689,15 +692,17 @@ static int load_elf_binary(struct linux_binprm *bprm) int bss_prot = 0; int retval, i; unsigned long elf_entry; + unsigned long e_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; + struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; struct { - struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + struct mm_struct *mm; struct pt_regs *regs; loc = kmalloc(sizeof(*loc), GFP_KERNEL); @@ -705,30 +710,27 @@ static int load_elf_binary(struct linux_binprm *bprm) retval = -ENOMEM; goto out_ret; } - - /* Get the exec-header */ - loc->elf_ex = *((struct elfhdr *)bprm->buf); retval = -ENOEXEC; /* First of all, some simple consistency checks */ - if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out; - if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN) goto out; - if (!elf_check_arch(&loc->elf_ex)) + if (!elf_check_arch(elf_ex)) goto out; - if (elf_check_fdpic(&loc->elf_ex)) + if (elf_check_fdpic(elf_ex)) goto out; if (!bprm->file->f_op->mmap) goto out; - elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); + elf_phdata = load_elf_phdrs(elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; if (elf_ppnt->p_type != PT_INTERP) @@ -782,7 +784,7 @@ out_free_interp: } elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) @@ -792,7 +794,7 @@ out_free_interp: break; case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + retval = arch_elf_pt_proc(elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) @@ -836,7 +838,7 @@ out_free_interp: * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, + retval = arch_check_elf(elf_ex, !!interpreter, &loc->interp_elf_ex, &arch_state); if (retval) @@ -849,8 +851,8 @@ out_free_interp: /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY2(loc->elf_ex, &arch_state); - if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + SET_PERSONALITY2(*elf_ex, &arch_state); + if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) @@ -877,7 +879,7 @@ out_free_interp: /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. 
*/ for(i = 0, elf_ppnt = elf_phdata; - i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -921,9 +923,9 @@ out_free_interp: * If we are loading ET_EXEC or we have already performed * the ET_DYN load_addr calculations, proceed normally. */ - if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + if (elf_ex->e_type == ET_EXEC || load_addr_set) { elf_flags |= MAP_FIXED; - } else if (loc->elf_ex.e_type == ET_DYN) { + } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the @@ -972,7 +974,7 @@ out_free_interp: load_bias = ELF_PAGESTART(load_bias - vaddr); total_size = total_mapping_size(elf_phdata, - loc->elf_ex.e_phnum); + elf_ex->e_phnum); if (!total_size) { retval = -EINVAL; goto out_free_dentry; @@ -990,7 +992,7 @@ out_free_interp: if (!load_addr_set) { load_addr_set = 1; load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); - if (loc->elf_ex.e_type == ET_DYN) { + if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); load_addr += load_bias; @@ -998,7 +1000,7 @@ out_free_interp: } } k = elf_ppnt->p_vaddr; - if (k < start_code) + if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; if (start_data < k) start_data = k; @@ -1031,7 +1033,7 @@ out_free_interp: } } - loc->elf_ex.e_entry += load_bias; + e_entry = elf_ex->e_entry + load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1074,7 +1076,7 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); } else { - elf_entry = loc->elf_ex.e_entry; + elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { retval = -EINVAL; goto out_free_dentry; @@ -1092,15 +1094,17 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, &loc->elf_ex, - load_addr, interp_load_addr); + retval = create_elf_tables(bprm, elf_ex, + load_addr, interp_load_addr, e_entry); if (retval < 0) goto out; - current->mm->end_code = end_code; - current->mm->start_code = start_code; - current->mm->start_data = start_data; - current->mm->end_data = end_data; - current->mm->start_stack = bprm->p; + + mm = current->mm; + mm->end_code = end_code; + mm->start_code = start_code; + mm->start_data = start_data; + mm->end_data = end_data; + mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { /* @@ -1111,12 +1115,11 @@ out_free_interp: * growing down), and into the unused ELF_ET_DYN_BASE region. 
*/ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - loc->elf_ex.e_type == ET_DYN && !interpreter) - current->mm->brk = current->mm->start_brk = - ELF_ET_DYN_BASE; + elf_ex->e_type == ET_DYN && !interpreter) { + mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } - current->mm->brk = current->mm->start_brk = - arch_randomize_brk(current->mm); + mm->brk = mm->start_brk = arch_randomize_brk(mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif @@ -1574,6 +1577,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, */ static int fill_files_note(struct memelfnote *note) { + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; @@ -1581,7 +1585,7 @@ static int fill_files_note(struct memelfnote *note) char *name_base, *name_curpos; /* *Estimated* file count and total data size needed */ - count = current->mm->map_count; + count = mm->map_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1591,6 +1595,10 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); + /* + * "size" can be 0 here legitimately. + * Let it ENOMEM and omit NT_FILE section which will be empty anyway. + */ data = kvmalloc(size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; @@ -1599,7 +1607,7 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { struct file *file; const char *filename; @@ -1633,10 +1641,10 @@ static int fill_files_note(struct memelfnote *note) data[0] = count; data[1] = PAGE_SIZE; /* - * Count usually is less than current->mm->map_count, + * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = current->mm->map_count - count; + n = mm->map_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -2182,7 +2190,7 @@ static int elf_core_dump(struct coredump_params *cprm) int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; - struct elfhdr *elf = NULL; + struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; struct elf_phdr *phdr4note = NULL; @@ -2203,10 +2211,6 @@ static int elf_core_dump(struct coredump_params *cprm) * exists while dumping the mm->vm_next areas to the core file. */ - /* alloc memory for large data structures: too large to be on stack */ - elf = kmalloc(sizeof(*elf), GFP_KERNEL); - if (!elf) - goto out; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. @@ -2230,7 +2234,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. 
*/ - if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2238,7 +2242,7 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ + offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ @@ -2257,11 +2261,13 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) - goto end_coredump; + /* + * Zero vma process will get ZERO_SIZE_PTR here. + * Let coredump continue for register state at least. + */ vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), GFP_KERNEL); - if (ZERO_OR_NULL_PTR(vma_filesz)) + if (!vma_filesz) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2281,12 +2287,12 @@ static int elf_core_dump(struct coredump_params *cprm) shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); if (!shdr4extnum) goto end_coredump; - fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + fill_extnum_info(&elf, shdr4extnum, e_shoff, segs); } offset = dataoff; - if (!dump_emit(cprm, elf, sizeof(*elf))) + if (!dump_emit(cprm, &elf, sizeof(elf))) goto end_coredump; if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) @@ -2370,8 +2376,6 @@ cleanup: kfree(shdr4extnum); kvfree(vma_filesz); kfree(phdr4note); - kfree(elf); -out: return has_dumped; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de95ad27722f..9ab610cc9114 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1290,7 +1290,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { bytes = min_t(unsigned long, bvec.bv_len, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, working_bytes); kaddr = kmap_atomic(bvec.bv_page); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index a6c90a003c12..05615a1099db 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,9 +20,13 @@ #include <linux/refcount.h> #include "compression.h" +/* workspace buffer size for s390 zlib hardware support */ +#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) + struct workspace { z_stream strm; char *buf; + unsigned int buf_size; struct list_head list; int level; }; @@ -61,7 +65,21 @@ struct list_head *zlib_alloc_workspace(unsigned int level) zlib_inflate_workspacesize()); workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); workspace->level = level; - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = NULL; + /* + * In case of s390 zlib hardware support, allocate larger workspace + * buffer. If allocator fails, fall back to a single page buffer. 
+ */ + if (zlib_deflate_dfltcc_enabled()) { + workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN | GFP_NOIO); + workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; + } + if (!workspace->buf) { + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf_size = PAGE_SIZE; + } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -85,6 +103,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; + unsigned int in_buf_pages; unsigned long len = *total_out; unsigned long nr_dest_pages = *out_pages; const unsigned long max_out = nr_dest_pages * PAGE_SIZE; @@ -102,9 +121,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { ret = -ENOMEM; @@ -114,12 +130,51 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, pages[0] = out_page; nr_pages = 1; - workspace->strm.next_in = data_in; + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = 0; workspace->strm.next_out = cpage_out; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.avail_in = min(len, PAGE_SIZE); while (workspace->strm.total_in < len) { + /* + * Get next input pages and copy the contents to + * the workspace buffer if required. + */ + if (workspace->strm.avail_in == 0) { + bytes_left = len - workspace->strm.total_in; + in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_pages > 1) { + int i; + + for (i = 0; i < in_buf_pages; i++) { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + memcpy(workspace->buf + i * PAGE_SIZE, + data_in, PAGE_SIZE); + start += PAGE_SIZE; + } + workspace->strm.next_in = workspace->buf; + } else { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + start += PAGE_SIZE; + workspace->strm.next_in = data_in; + } + workspace->strm.avail_in = min(bytes_left, + (unsigned long) workspace->buf_size); + } + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (ret != Z_OK) { pr_debug("BTRFS: deflate in loop returned %d\n", @@ -161,33 +216,43 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, /* we're all done */ if (workspace->strm.total_in >= len) break; - - /* we've read in a full page, get a new one */ - if (workspace->strm.avail_in == 0) { - if (workspace->strm.total_out > max_out) - break; - - bytes_left = len - workspace->strm.total_in; - kunmap(in_page); - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap(in_page); - workspace->strm.avail_in = min(bytes_left, - PAGE_SIZE); - workspace->strm.next_in = data_in; - } + if (workspace->strm.total_out > max_out) + break; } workspace->strm.avail_in = 0; - ret = zlib_deflate(&workspace->strm, Z_FINISH); - zlib_deflateEnd(&workspace->strm); - - if (ret != Z_STREAM_END) { - ret = -EIO; - goto out; + /* + * Call deflate with Z_FINISH flush parameter providing more output + * space but no more input data, until it returns with Z_STREAM_END. 
+ */ + while (ret != Z_STREAM_END) { + ret = zlib_deflate(&workspace->strm, Z_FINISH); + if (ret == Z_STREAM_END) + break; + if (ret != Z_OK && ret != Z_BUF_ERROR) { + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } else if (workspace->strm.avail_out == 0) { + /* get another page for the stream end */ + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.next_out = cpage_out; + } } + zlib_deflateEnd(&workspace->strm); if (workspace->strm.total_out >= workspace->strm.total_in) { ret = -E2BIG; @@ -231,7 +296,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -270,7 +335,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; if (workspace->strm.avail_in == 0) { unsigned long tmp; @@ -320,7 +385,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, workspace->strm.total_in = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -364,7 +429,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, bytes_left); kaddr = kmap_atomic(dest_page); @@ -375,7 +440,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, bytes_left -= bytes; next: workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; } if (ret != Z_STREAM_END && bytes_left != 0) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c deleted file mode 100644 index 358ea2ecf36b..000000000000 --- a/fs/compat_ioctl.c +++ /dev/null @@ -1,261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. 
- */ - -#include <linux/types.h> -#include <linux/compat.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/compiler.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/ioctl.h> -#include <linux/if.h> -#include <linux/raid/md_u.h> -#include <linux/falloc.h> -#include <linux/file.h> -#include <linux/ppp-ioctl.h> -#include <linux/if_pppox.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/blkdev.h> -#include <linux/serial.h> -#include <linux/ctype.h> -#include <linux/syscalls.h> -#include <linux/gfp.h> -#include <linux/cec.h> - -#include "internal.h" - -#ifdef CONFIG_BLOCK -#include <linux/cdrom.h> -#include <linux/fd.h> -#include <scsi/scsi.h> -#include <scsi/scsi_ioctl.h> -#include <scsi/sg.h> -#endif - -#include <linux/uaccess.h> -#include <linux/watchdog.h> - -#include <linux/hiddev.h> - - -#include <linux/sort.h> - -/* - * simple reversible transform to make our table more evenly - * distributed after sorting. - */ -#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) - -#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), -static unsigned int ioctl_pointer[] = { -#ifdef CONFIG_BLOCK -/* Big S */ -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) -COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) -COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) -#endif -#ifdef CONFIG_BLOCK -/* SG stuff */ -COMPATIBLE_IOCTL(SG_IO) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_TIMEOUT) -COMPATIBLE_IOCTL(SG_GET_TIMEOUT) -COMPATIBLE_IOCTL(SG_EMULATED_HOST) -COMPATIBLE_IOCTL(SG_GET_TRANSFORM) -COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_SCSI_ID) -COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) -COMPATIBLE_IOCTL(SG_GET_LOW_DMA) -COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) -COMPATIBLE_IOCTL(SG_SET_DEBUG) -COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) -COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) -COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) -COMPATIBLE_IOCTL(SG_SCSI_RESET) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) -COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) -#endif -}; - -/* - * Convert common ioctl arguments based on their command number - * - * Please do not add any code in here. Instead, implement - * a compat_ioctl operation in the place that handles the - * ioctl for the native case. 
- */ -static long do_ioctl_trans(unsigned int cmd, - unsigned long arg, struct file *file) -{ - return -ENOIOCTLCMD; -} - -static int compat_ioctl_check_table(unsigned int xcmd) -{ -#ifdef CONFIG_BLOCK - int i; - const int max = ARRAY_SIZE(ioctl_pointer) - 1; - - BUILD_BUG_ON(max >= (1 << 16)); - - /* guess initial offset into table, assuming a - normalized distribution */ - i = ((xcmd >> 16) * max) >> 16; - - /* do linear search up first, until greater or equal */ - while (ioctl_pointer[i] < xcmd && i < max) - i++; - - /* then do linear search down */ - while (ioctl_pointer[i] > xcmd && i > 0) - i--; - - return ioctl_pointer[i] == xcmd; -#else - return 0; -#endif -} - -COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg32) -{ - unsigned long arg = arg32; - struct fd f = fdget(fd); - int error = -EBADF; - if (!f.file) - goto out; - - /* RED-PEN how should LSM module know it's handling 32bit? */ - error = security_file_ioctl(f.file, cmd, arg); - if (error) - goto out_fput; - - switch (cmd) { - /* these are never seen by ->ioctl(), no argument or int argument */ - case FIOCLEX: - case FIONCLEX: - case FIFREEZE: - case FITHAW: - case FICLONE: - goto do_ioctl; - /* these are never seen by ->ioctl(), pointer argument */ - case FIONBIO: - case FIOASYNC: - case FIOQSIZE: - case FS_IOC_FIEMAP: - case FIGETBSZ: - case FICLONERANGE: - case FIDEDUPERANGE: - goto found_handler; - /* - * The next group is the stuff handled inside file_ioctl(). - * For regular files these never reach ->ioctl(); for - * devices, sockets, etc. they do and one (FIONREAD) is - * even accepted in some cases. In all those cases - * argument has the same type, so we can handle these - * here, shunting them towards do_vfs_ioctl(). - * ->compat_ioctl() will never see any of those. 
- */ - /* pointer argument, never actually handled by ->ioctl() */ - case FIBMAP: - goto found_handler; - /* handled by some ->ioctl(); always a pointer to int */ - case FIONREAD: - goto found_handler; - /* these get messy on amd64 due to alignment differences */ -#if defined(CONFIG_X86_64) - case FS_IOC_RESVSP_32: - case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); - goto out_fput; - case FS_IOC_UNRESVSP_32: - case FS_IOC_UNRESVSP64_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, - compat_ptr(arg)); - goto out_fput; - case FS_IOC_ZERO_RANGE_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, - compat_ptr(arg)); - goto out_fput; -#else - case FS_IOC_RESVSP: - case FS_IOC_RESVSP64: - case FS_IOC_UNRESVSP: - case FS_IOC_UNRESVSP64: - case FS_IOC_ZERO_RANGE: - goto found_handler; -#endif - - default: - if (f.file->f_op->compat_ioctl) { - error = f.file->f_op->compat_ioctl(f.file, cmd, arg); - if (error != -ENOIOCTLCMD) - goto out_fput; - } - - if (!f.file->f_op->unlocked_ioctl) - goto do_ioctl; - break; - } - - if (compat_ioctl_check_table(XFORM(cmd))) - goto found_handler; - - error = do_ioctl_trans(cmd, arg, f.file); - if (error == -ENOIOCTLCMD) - error = -ENOTTY; - - goto out_fput; - - found_handler: - arg = (unsigned long)compat_ptr(arg); - do_ioctl: - error = do_vfs_ioctl(f.file, fd, cmd, arg); - out_fput: - fdput(f); - out: - return error; -} - -static int __init init_sys32_ioctl_cmp(const void *p, const void *q) -{ - unsigned int a, b; - a = *(unsigned int *)p; - b = *(unsigned int *)q; - if (a > b) - return 1; - if (a < b) - return -1; - return 0; -} - -static int __init init_sys32_ioctl(void) -{ - sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), - init_sys32_ioctl_cmp, NULL); - return 0; -} -__initcall(init_sys32_ioctl); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index dede25247b81..634b09d18b77 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -142,18 +142,21 @@ EXPORT_SYMBOL_GPL(debugfs_file_put); * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ -static bool debugfs_is_locked_down(struct inode *inode, - struct file *filp, - const struct file_operations *real_fops) +static int debugfs_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) { if ((inode->i_mode & 07777) == 0444 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap) - return false; + return 0; - return security_locked_down(LOCKDOWN_DEBUGFS); + if (security_locked_down(LOCKDOWN_DEBUGFS)) + return -EPERM; + + return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) @@ -168,7 +171,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) real_fops = debugfs_real_fops(filp); - r = debugfs_is_locked_down(inode, filp, real_fops); + r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -298,7 +301,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) real_fops = debugfs_real_fops(filp); - r = debugfs_is_locked_down(inode, filp, real_fops); + r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -496,10 +499,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. 
This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_u32(const char *name, umode_t mode, @@ -581,10 +584,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_ulong(const char *name, umode_t mode, @@ -846,10 +849,10 @@ static const struct file_operations fops_bool_wo = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, @@ -899,10 +902,10 @@ static const struct file_operations fops_blob = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, @@ -1091,10 +1094,10 @@ static const struct file_operations fops_regset32 = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. 
*/ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, @@ -1158,4 +1161,3 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); - diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f4d8df5e4714..dc6cffc4feba 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -423,7 +423,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -502,7 +502,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -627,7 +627,7 @@ EXPORT_SYMBOL(debugfs_create_automount); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is - * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) + * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -906,4 +906,3 @@ static int __init debugfs_init(void) return retval; } core_initcall(debugfs_init); - diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3951d39b9b75..cdfaf4f0e11a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1035,7 +1035,7 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct timeval tv = { .tv_sec = 5, .tv_usec = 0 }; + struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -1087,12 +1087,12 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. 
*/ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv, + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv, + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, sizeof(tv)); if (result == -EINPROGRESS) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f91db24bbf3b..db1ef144c63a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1586,7 +1586,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm, } crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); if (*key_size == 0) - *key_size = crypto_skcipher_default_keysize(*key_tfm); + *key_size = crypto_skcipher_max_keysize(*key_tfm); get_random_bytes(dummy_key, *key_size); rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 216fbe6a4837..7d326aa0308e 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -2204,9 +2204,9 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (mount_crypt_stat->global_default_cipher_key_size == 0) { printk(KERN_WARNING "No key size specified at mount; " "defaulting to [%d]\n", - crypto_skcipher_default_keysize(tfm)); + crypto_skcipher_max_keysize(tfm)); mount_crypt_stat->global_default_cipher_key_size = - crypto_skcipher_default_keysize(tfm); + crypto_skcipher_max_keysize(tfm); } if (crypt_stat->key_size == 0) crypt_stat->key_size = diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2890a67a1ded..5779a15c2cd6 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -306,24 +306,22 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, } src = kmap_atomic(*rq->in); - if (!rq->out[0]) { - dst = NULL; - } else { + if (rq->out[0]) { dst = kmap_atomic(rq->out[0]); memcpy(dst + rq->pageofs_out, src, righthalf); + kunmap_atomic(dst); } - if (rq->out[1] == *rq->in) { - memmove(src, src + righthalf, rq->pageofs_out); - } else if (nrpages_out == 2) { - if (dst) - kunmap_atomic(dst); + if (nrpages_out == 2) { DBG_BUGON(!rq->out[1]); - dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + righthalf, rq->pageofs_out); + if (rq->out[1] == *rq->in) { + memmove(src, src + righthalf, rq->pageofs_out); + } else { + dst = kmap_atomic(rq->out[1]); + memcpy(dst, src + righthalf, rq->pageofs_out); + kunmap_atomic(dst); + } } - if (dst) - kunmap_atomic(dst); kunmap_atomic(src); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1ed5beff7d11..c4c6dcdc89ad 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -401,9 +401,9 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr) #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index, bool *tag); + pgoff_t index); int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp, bool tag); + struct erofs_workgroup *grp); void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 1e8e1450d5b0..fddc5059c930 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -59,7 +59,7 @@ repeat: } struct erofs_workgroup 
*erofs_find_workgroup(struct super_block *sb, - pgoff_t index, bool *tag) + pgoff_t index) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_workgroup *grp; @@ -68,9 +68,6 @@ repeat: rcu_read_lock(); grp = radix_tree_lookup(&sbi->workstn_tree, index); if (grp) { - *tag = xa_pointer_tag(grp); - grp = xa_untag_pointer(grp); - if (erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); @@ -84,8 +81,7 @@ repeat: } int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp, - bool tag) + struct erofs_workgroup *grp) { struct erofs_sb_info *sbi; int err; @@ -103,8 +99,6 @@ int erofs_register_workgroup(struct super_block *sb, sbi = EROFS_SB(sb); xa_lock(&sbi->workstn_tree); - grp = xa_tag_pointer(grp, tag); - /* * Bump up reference count before making this workgroup * visible to other users in order to avoid potential UAF @@ -175,8 +169,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ - DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, - grp->index)) != grp); + DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp); /* * If managed cache is on, last refcount should indicate @@ -201,7 +194,7 @@ repeat: batch, first_index, PAGEVEC_SIZE); for (i = 0; i < found; ++i) { - struct erofs_workgroup *grp = xa_untag_pointer(batch[i]); + struct erofs_workgroup *grp = batch[i]; first_index = grp->index + 1; diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 3585b84d2f20..50966f1c676e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -46,18 +46,19 @@ extern const struct xattr_handler erofs_xattr_security_handler; static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { -static const struct xattr_handler *xattr_handler_map[] = { - [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, + static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL - [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = + &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, #endif - [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY - [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, #endif -}; + }; return idx && idx < ARRAY_SIZE(xattr_handler_map) ? 
xattr_handler_map[idx] : NULL; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ca99425a4536..80e47f07d946 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -345,9 +345,8 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; unsigned int length; - bool tag; - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (!grp) return -ENOENT; @@ -438,7 +437,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, */ mutex_trylock(&cl->lock); - err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0); + err = erofs_register_workgroup(inode->i_sb, &pcl->obj); if (err) { mutex_unlock(&cl->lock); kmem_cache_free(pcluster_cachep, pcl); @@ -1149,21 +1148,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[], - unsigned int nr_bios, bool force_fg) -{ - /* - * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. - */ - if (force_fg || nr_bios) - return false; - - kvfree(q[JQ_SUBMIT]); - return true; -} - -static bool z_erofs_submit_queue(struct super_block *sb, +static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t owned_head, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, @@ -1172,19 +1157,12 @@ static bool z_erofs_submit_queue(struct super_block *sb, struct erofs_sb_info *const sbi = EROFS_SB(sb); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - struct bio *bio; void *bi_private; /* since bio will be NULL, no need to initialize last_index */ pgoff_t uninitialized_var(last_index); - bool force_submit = false; - unsigned int nr_bios; - - if (owned_head == Z_EROFS_PCLUSTER_TAIL) - return false; + unsigned int nr_bios = 0; + struct bio *bio = NULL; - force_submit = false; - bio = NULL; - nr_bios = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1194,11 +1172,9 @@ static bool z_erofs_submit_queue(struct super_block *sb, do { struct z_erofs_pcluster *pcl; - unsigned int clusterpages; - pgoff_t first_index; - struct page *page; - unsigned int i = 0, bypass = 0; - int err; + pgoff_t cur, end; + unsigned int i = 0; + bool bypass = true; /* no possible 'owned_head' equals the following */ DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); @@ -1206,55 +1182,50 @@ static bool z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - clusterpages = BIT(pcl->clusterbits); + cur = pcl->obj.index; + end = cur + BIT(pcl->clusterbits); /* close the main owned chain at first */ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, Z_EROFS_PCLUSTER_TAIL_CLOSED); - first_index = pcl->obj.index; - force_submit |= (first_index != last_index + 1); + do { + struct page *page; + int err; -repeat: - page = pickup_page_for_submission(pcl, i, pagepool, - MNGD_MAPPING(sbi), - GFP_NOFS); - if (!page) { - force_submit = true; - ++bypass; - goto skippage; - } + page = pickup_page_for_submission(pcl, i++, pagepool, + MNGD_MAPPING(sbi), + GFP_NOFS); + if (!page) + continue; - if (bio && force_submit) { + if (bio && cur != last_index + 1) { submit_bio_retry: - submit_bio(bio); - bio = NULL; - } - 
- if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + submit_bio(bio); + bio = NULL; + } - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); - bio->bi_iter.bi_sector = (sector_t)(first_index + i) << - LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; - bio->bi_opf = REQ_OP_READ; + if (!bio) { + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - ++nr_bios; - } + bio->bi_end_io = z_erofs_decompressqueue_endio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)cur << + LOG_SECTORS_PER_BLOCK; + bio->bi_private = bi_private; + bio->bi_opf = REQ_OP_READ; + ++nr_bios; + } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) - goto submit_bio_retry; + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (err < PAGE_SIZE) + goto submit_bio_retry; - force_submit = false; - last_index = first_index + i; -skippage: - if (++i < clusterpages) - goto repeat; + last_index = cur; + bypass = false; + } while (++cur < end); - if (bypass < clusterpages) + if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else move_to_bypass_jobqueue(pcl, qtail, owned_head); @@ -1263,11 +1234,15 @@ skippage: if (bio) submit_bio(bio); - if (postsubmit_is_all_bypassed(q, nr_bios, *force_fg)) - return true; - + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. + */ + if (!*force_fg && !nr_bios) { + kvfree(q[JQ_SUBMIT]); + return; + } z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); - return true; } static void z_erofs_runqueue(struct super_block *sb, @@ -1276,9 +1251,9 @@ static void z_erofs_runqueue(struct super_block *sb, { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (!z_erofs_submit_queue(sb, clt->owned_head, - pagepool, io, &force_fg)) + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) return; + z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 67a395039268..b041b66002db 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) return container_of(p, struct ep_pqueue, pt)->epi; } -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -static inline int ep_op_has_event(int op) -{ - return op != EPOLL_CTL_DEL; -} - /* Initialize the poll safe wake up structure */ static void ep_nested_calls_init(struct nested_calls *ncalls) { @@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size) return do_epoll_create(0); } -/* - * The following function implements the controller interface for - * the eventpoll file that enables the insertion/removal/change of - * file descriptors inside the interest set. 
- */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, + bool nonblock) +{ + if (!nonblock) { + mutex_lock_nested(mutex, depth); + return 0; + } + if (mutex_trylock(mutex)) + return 0; + return -EAGAIN; +} + +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; struct eventpoll *tep = NULL; - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - error = -EBADF; f = fdget(epfd); if (!f.file) @@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) - ep_take_care_of_epollwakeup(&epds); + ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor @@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) + goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || is_file_epoll(tf.file)) { - full_check = 1; mutex_unlock(&ep->mtx); - mutex_lock(&epmutex); + error = epoll_mutex_lock(&epmutex, 0, nonblock); + if (error) + goto error_tgt_fput; + full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; if (ep_loop_check(ep, tf.file) != 0) { @@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else list_add(&tf.file->f_tfile_llink, &tfile_check_list); - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) { +out_del: + list_del(&tf.file->f_tfile_llink); + goto error_tgt_fput; + } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); + if (error) { + mutex_unlock(&ep->mtx); + goto out_del; + } } } } @@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, &epds, tf.file, fd, full_check); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; if (full_check) @@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_modify(ep, epi, epds); } } else error = -ENOENT; @@ -2232,6 +2240,23 @@ error_return: } /* + * The following function implements the controller interface for + * the 
eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + struct epoll_event epds; + + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + return -EFAULT; + + return do_epoll_ctl(epfd, op, fd, &epds, false); +} + +/* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ diff --git a/fs/exec.c b/fs/exec.c index 74d88dab98dd..db17be51b112 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -272,7 +272,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) goto err; mm->stack_vm = mm->total_vm = 1; - arch_bprm_mm_init(mm, vma); up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); return 0; @@ -761,6 +760,11 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; BUG_ON(prev != vma); + if (unlikely(vm_flags & VM_EXEC)) { + pr_warn_once("process '%pD4' started with executable stack\n", + bprm->file); + } + /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index bcffe25da2f0..4a4ab683250d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1073,9 +1073,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext2; - sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) - 1) - / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array (db_count, @@ -1138,6 +1138,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_count_dirs(sb), GFP_KERNEL); } if (err) { + ret = err; ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index db9bfa08d3e0..2a592e38cdfe 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -4,12 +4,7 @@ # kernels after the removal of ext3 driver. config EXT3_FS tristate "The Extended 3 (ext3) filesystem" - # These must match EXT4_FS selects... select EXT4_FS - select JBD2 - select CRC16 - select CRYPTO - select CRYPTO_CRC32C help This config option is here only for backward compatibility. ext3 filesystem is now handled by the ext4 driver. 
@@ -33,7 +28,6 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" - # Please update EXT3_FS selects when changing these select JBD2 select CRC16 select CRYPTO @@ -109,7 +103,7 @@ config EXT4_DEBUG echo 1 > /sys/module/ext4/parameters/mballoc_debug config EXT4_KUNIT_TESTS - bool "KUnit tests for ext4" + tristate "KUnit tests for ext4" select EXT4_FS depends on KUNIT help diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 840b91d040f1..4ccb3c9189d8 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -13,5 +13,6 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_KUNIT_TESTS) += inode-test.o +ext4-inode-test-objs += inode-test.o +obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o ext4-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0b202e00d93f..5f993a411251 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh))) { + desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, @@ -505,7 +506,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 4e093277c8bf..1f340743c9a8 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -462,7 +462,6 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); - new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; @@ -672,9 +671,11 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; - struct inode *inode = dentry->d_parent->d_inode; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { + if (!inode || !IS_CASEFOLDED(inode) || + !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); @@ -687,10 +688,11 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) { const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode) || !um) + if (!inode || !IS_CASEFOLDED(inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f8578caba40d..9a2ee2428ecc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1052,8 +1052,6 @@ struct ext4_inode_info { /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned 
int i_reserved_data_blocks; - ext4_lblk_t i_da_metadata_calc_last_lblock; - int i_da_metadata_calc_len; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; @@ -1343,7 +1341,8 @@ struct ext4_super_block { __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; - __u8 s_pad[2]; + __u8 s_first_error_errcode; + __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_reserved[95]; /* Padding to the end of the block */ @@ -1556,6 +1555,9 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; struct dax_device *s_daxdev; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1575,6 +1577,66 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) } /* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +static inline void ext4_simulate_fail_bh(struct super_block *sb, + struct buffer_head *bh, + unsigned long code) +{ + if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) + clear_buffer_uptodate(bh); +} + +/* + * Error number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. 
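
The EXT4_SIM_* codes and the ext4_simulate_fail()/ext4_simulate_fail_bh() helpers above implement one-shot fault injection: a single pending code is armed through a debug knob and consumed by the first matching check, so exactly one operation fails. A minimal user-space sketch of that consume-once pattern (names here are illustrative stand-ins, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* 'pending_fail' stands in for sbi->s_simulate_fail, which the real code
 * sets via sysfs; codes start at 1, so 0 means "nothing armed". */
static unsigned long pending_fail;

static bool simulate_fail(unsigned long code)
{
	if (pending_fail == code) {
		pending_fail = 0;	/* consume: only a single hit */
		return true;
	}
	return false;
}

int main(void)
{
	pending_fail = 2;			/* e.g. EXT4_SIM_BBITMAP_CRC */
	printf("%d\n", simulate_fail(2));	/* 1: failure injected once */
	printf("%d\n", simulate_fail(2));	/* 0: already consumed */
	return 0;
}
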
+ */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + +/* * Inode dynamic state flags */ enum { @@ -2628,7 +2690,6 @@ extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, @@ -2679,8 +2740,6 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); -extern void *ext4_kvmalloc(size_t size, gfp_t flags); -extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, @@ -2688,6 +2747,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern void ext4_set_errno(struct super_block *sb, int err); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, @@ -3254,7 +3314,6 @@ struct ext4_extent; #define EXT_MAX_BLOCKS 0xffffffff extern int ext4_ext_tree_init(handle_t *handle, struct inode *); -extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3271,14 +3330,9 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); @@ -3294,8 +3348,6 @@ extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); -extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); -extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, @@ -3390,6 +3442,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t 
*io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 98bd0e9ee7df..1c216fcc202a 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -267,10 +267,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path); - #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d3b8cdea5df7..1f53d64e42a5 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,6 +7,28 @@ #include <trace/events/ext4.h> +int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + BUG(); +} + /* Just increment the non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { @@ -58,6 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb) * take the FS itself readonly cleanly. 
*/ if (journal && is_journal_aborted(journal)) { + ext4_set_errno(sb, -journal->j_errno); ext4_abort(sb, "Detected aborted journal"); return -EROFS; } @@ -249,6 +272,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_set_errno(inode->i_sb, -err); __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } @@ -320,6 +344,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_block = cpu_to_le64(bh->b_blocknr); + ext4_set_errno(inode->i_sb, EIO); ext4_error_inode(inode, where, line, bh->b_blocknr, "IO error syncing itable block"); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a6b9b66dbfad..7ea4f6fa173b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -463,27 +463,7 @@ int ext4_force_commit(struct super_block *sb); #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ -static inline int ext4_inode_journal_mode(struct inode *inode) -{ - if (EXT4_JOURNAL(inode) == NULL) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - /* We do not support data journalling with delayed allocation */ - if (!S_ISREG(inode->i_mode) || - ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { - /* We do not support data journalling for encrypted data */ - if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - } - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - BUG(); -} +int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0e8708b77da6..954013d6076b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,8 +161,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; @@ -179,6 +180,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, return err; } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) + static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) @@ -309,53 +313,6 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, (nofail ? 
EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); } -/* - * Calculate the number of metadata blocks needed - * to allocate @blocks - * Worse case is one block per extent - */ -int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - int idxs; - - idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent_idx)); - - /* - * If the new delayed allocation block is contiguous with the - * previous da block, it can share index blocks with the - * previous block, so we only need to allocate a new index - * block every idxs leaf blocks. At ldxs**2 blocks, we need - * an additional index block, and at ldxs**3 blocks, yet - * another index blocks. - */ - if (ei->i_da_metadata_calc_len && - ei->i_da_metadata_calc_last_lblock+1 == lblock) { - int num = 0; - - if ((ei->i_da_metadata_calc_len % idxs) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { - num++; - ei->i_da_metadata_calc_len = 0; - } else - ei->i_da_metadata_calc_len++; - ei->i_da_metadata_calc_last_lblock++; - return num; - } - - /* - * In the worst case we need a new set of index blocks at - * every level of the inode's extent tree. - */ - ei->i_da_metadata_calc_len = 1; - ei->i_da_metadata_calc_last_lblock = lblock; - return ext_depth(inode) + 1; -} - static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -492,6 +449,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, return 0; corrupted: + ext4_set_errno(inode->i_sb, -err); ext4_error_inode(inode, function, line, 0, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", @@ -510,6 +468,30 @@ int ext4_ext_check_inode(struct inode *inode) return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } +static void ext4_cache_extents(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } +} + static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, ext4_fsblk_t pblk, int depth, @@ -544,26 +526,7 @@ __read_extent_tree_block(const char *function, unsigned int line, */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); - struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); - ext4_lblk_t prev = 0; - int i; - - for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { - unsigned int status = EXTENT_STATUS_WRITTEN; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - int len = ext4_ext_get_actual_len(ex); - - if (prev && (prev != lblk)) - ext4_es_cache_extent(inode, prev, - lblk - prev, ~0, - EXTENT_STATUS_HOLE); - - if (ext4_ext_is_unwritten(ex)) - status = EXTENT_STATUS_UNWRITTEN; - ext4_es_cache_extent(inode, lblk, len, - ext4_ext_pblock(ex), status); - prev = lblk + len; - } + ext4_cache_extents(inode, eh); } return bh; errout: @@ -649,8 +612,9 @@ static void ext4_ext_show_path(struct 
inode *inode, struct ext4_ext_path *path) ext_debug("path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - ext4_idx_pblock(path->p_idx)); + ext_debug(" %d->%llu", + le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), @@ -731,11 +695,12 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) if (!path) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) + for (i = 0; i <= depth; i++, path++) { if (path->p_bh) { brelse(path->p_bh); path->p_bh = NULL; } + } } /* @@ -777,8 +742,8 @@ ext4_ext_binsearch_idx(struct inode *inode, chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { - if (k != 0 && - le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + if (k != 0 && le32_to_cpu(ix->ei_block) <= + le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); @@ -911,6 +876,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, path[0].p_bh = NULL; i = depth; + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) + ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug("depth %d: num %d, max %d\n", @@ -1632,17 +1599,16 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) return EXT_MAX_BLOCKS; while (depth >= 0) { + struct ext4_ext_path *p = &path[depth]; + if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext && - path[depth].p_ext != - EXT_LAST_EXTENT(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_ext[1].ee_block); + if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) + return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ - if (path[depth].p_idx != - EXT_LAST_INDEX(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_idx[1].ei_block); + if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) + return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } @@ -1742,9 +1708,9 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -int -ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, - struct ext4_extent *ex2) +static int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; @@ -1758,11 +1724,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, le32_to_cpu(ex2->ee_block)) return 0; - /* - * To allow future support for preallocated extents to be added - * as an RO_COMPAT feature, refuse to merge to extents if - * this can result in the top bit of ee_len being set. - */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; @@ -1870,13 +1831,14 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, } /* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. + * This function tries to merge the @ex extent to neighbours in the tree, then + * tries to collapse the extent tree into the inode. 
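
ext4_can_extents_be_merged(), made static in the hunk above, boils down to two tests: the extents must be logically contiguous, and the combined length must stay under the on-disk cap. A small stand-alone sketch of that check, with simplified stand-in types and the 32768-block initialized-extent cap assumed from EXT_INIT_MAX_LEN:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint32_t lblk;	/* first logical block */
	uint16_t len;	/* number of blocks */
};

#define MAX_INIT_LEN 32768	/* stand-in for EXT_INIT_MAX_LEN */

static bool can_merge(const struct extent *a, const struct extent *b)
{
	if (a->lblk + a->len != b->lblk)	/* must be contiguous */
		return false;
	return (uint32_t)a->len + b->len <= MAX_INIT_LEN;
}

int main(void)
{
	struct extent a = { .lblk = 0, .len = 100 };
	struct extent b = { .lblk = 100, .len = 50 };

	printf("%d\n", can_merge(&a, &b));	/* 1: contiguous and short */
	return 0;
}
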
*/ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex) { + struct ext4_extent *ex) +{ struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; @@ -3718,9 +3680,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); - if (IS_ENCRYPTED(inode)) - max_zeroout = 0; - /* * five cases: * 1. split the extent into three extents. @@ -4706,6 +4665,10 @@ retry: return ret > 0 ? ret2 : ret; } +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); + +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); + static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { @@ -4723,9 +4686,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - /* Call ext4_force_commit to flush all data in case of data=journal. */ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); @@ -4765,7 +4725,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -4849,7 +4809,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. */ - if ((offset + len) > i_size_read(inode)) + if (offset + len > inode->i_size) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); @@ -4890,14 +4850,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). - * - * XXX It's not clear why zero range isn't working, but we'll - * leave it disabled for encrypted inodes for now. This is a - * bug we should fix.... */ if (IS_ENCRYPTED(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | - FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ @@ -4941,7 +4896,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -5268,7 +5223,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; - bool update = 0; + bool update = false; depth = path->p_depth; while (depth >= 0) { @@ -5284,7 +5239,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, goto out; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) - update = 1; + update = true; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { @@ -5472,7 +5427,7 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. 
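
The collapse-range and insert-range hunks that follow replace separate offset and length masking with IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)). For a power-of-two size, OR-ing the two values and testing the low bits once is equivalent to testing each value separately; a small demonstration with illustrative names:

#include <stdbool.h>
#include <stdio.h>

static bool both_aligned(unsigned long long offset, unsigned long long len,
			 unsigned long long size)	/* size: power of two */
{
	return ((offset | len) & (size - 1)) == 0;
}

int main(void)
{
	printf("%d\n", both_aligned(4096, 8192, 4096));	/* 1: both aligned */
	printf("%d\n", both_aligned(4096, 6144, 4096));	/* 0: len unaligned */
	return 0;
}
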
*/ -int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; @@ -5489,12 +5444,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) - return -EINVAL; - - if (!S_ISREG(inode->i_mode)) + /* Collapse range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); @@ -5514,7 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= i_size_read(inode)) { + if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } @@ -5592,7 +5543,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_stop; } - new_size = i_size_read(inode) - len; + new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; @@ -5620,7 +5571,7 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; handle_t *handle; @@ -5639,14 +5590,10 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) + /* Insert range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -5666,14 +5613,14 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - /* Check for wrap through zero */ - if (inode->i_size + len > inode->i_sb->s_maxbytes) { + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } - /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + /* Offset must be less than i_size */ + if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 825313c59752..4ec30a798260 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -209,6 +209,12 @@ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) return es->es_pblk & ~ES_MASK; } +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 
0 : pblock; +} + static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6a7293a5cda2..5f225881176b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -88,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - if (!inode_trylock_shared(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) return -EAGAIN; + } else { inode_lock_shared(inode); } /* @@ -165,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp) * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ -static int -ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - - if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) - return 0; + unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) - return 1; + return true; - return 0; + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; } /* Is IO overwriting allocated and initialized blocks? */ @@ -203,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); } -static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; @@ -227,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + ret = file_modified(iocb->ki_filp); if (ret) return ret; - - return iov_iter_count(from); + return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, @@ -363,62 +381,137 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites. Otherwise we will + * switch to exclusive i_rwsem lock. 
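
The comment above describes the new direct-I/O locking strategy: start with the shared lock, and if the write turns out to need exclusivity (unaligned, extending, or not a plain overwrite), drop it, take the exclusive lock, and re-check, since the conditions may have changed while unlocked. A user-space sketch of that optimistic take-shared-then-restart pattern using a pthread rwlock (helper names are hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static bool needs_exclusive(void)
{
	/* e.g. an extending write or a non-overwrite; illustrative stub */
	return true;
}

static void do_write(void)
{
	bool shared = true;

	pthread_rwlock_rdlock(&lock);
restart:
	if (shared && needs_exclusive()) {
		pthread_rwlock_unlock(&lock);
		shared = false;
		pthread_rwlock_wrlock(&lock);
		goto restart;	/* re-evaluate under the exclusive lock */
	}
	/* ... issue the I/O under the chosen lock mode ... */
	pthread_rwlock_unlock(&lock);
}

int main(void)
{
	do_write();
	printf("done\n");
	return 0;
}
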
+ */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + if (ext4_extending_io(inode, offset, count)) + *extend = true; + /* + * Determine whether the IO operation will overwrite allocated + * and initialized blocks. + * We need exclusive i_rwsem for changing security info + * in file_modified(). + */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || + !ext4_overwrite_io(inode, offset, count))) { + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; - size_t count; - loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); - bool extend = false, overwrite = false, unaligned_aio = false; + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + bool extend = false, unaligned_io = false; + bool ilock_shared = true; + + /* + * We initially start with shared inode lock unless it is + * unaligned IO which needs exclusive lock anyways. + */ + if (ext4_unaligned_io(inode, from, offset)) { + unaligned_io = true; + ilock_shared = false; + } + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } } else { - inode_lock(inode); + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); } + /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_dio_supported(inode)) { - inode_unlock(inode); - /* - * Fallback to buffered I/O if the inode does not support - * direct I/O. - */ + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } - ret = ext4_write_checks(iocb, from); - if (ret <= 0) { - inode_unlock(inode); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + if (ret <= 0) return ret; - } - /* - * Unaligned asynchronous direct I/O must be serialized among each - * other as the zeroing of partial blocks of two competing unaligned - * asynchronous direct I/O writes can result in data corruption. - */ offset = iocb->ki_pos; - count = iov_iter_count(from); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { - unaligned_aio = true; - inode_dio_wait(inode); - } + count = ret; /* - * Determine whether the I/O will overwrite allocated and initialized - * blocks. If so, check to see whether it is possible to take the - * dioread_nolock path. 
+ * Unaligned direct IO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned IOs can result in data + * corruption. + * + * So we make sure we don't allow any unaligned IO in flight. + * For IOs where we need not wait (like unaligned non-AIO DIO), + * below inode_dio_wait() may anyway become a no-op, since we start + * with exclusive lock. */ - if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && - ext4_should_dioread_nolock(inode)) { - overwrite = true; - downgrade_write(&inode->i_rwsem); - } + if (unaligned_io) + inode_dio_wait(inode); - if (offset + count > EXT4_I(inode)->i_disksize) { + if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -431,18 +524,19 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - extend = true; ext4_journal_stop(handle); } - ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_aio || extend); + if (ilock_shared) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: - if (overwrite) + if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); @@ -487,9 +581,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) return -EAGAIN; + } else { inode_lock(inode); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8ca4a23129aa..c66e8f9451a2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8)) { + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); @@ -192,8 +193,10 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) get_bh(bh); submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); @@ -1223,6 +1226,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); + ext4_set_errno(sb, -err); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", ino, err); return inode; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3a4ab70fe9e0..569fc68e8975 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -660,32 +660,6 @@ out: } /* - * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock 
< EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2fec62d764fa..fad82d08fca5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { + ext4_set_errno(inode->i_sb, -error); ext4_error_inode(inode, __func__, __LINE__, 0, "can't get inode location %lu", inode->i_ino); @@ -849,7 +850,7 @@ out: /* * Prepare the write for the inline data. - * If the the data can be written into the inode, we just read + * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the @@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { + ext4_set_errno(dir->i_sb, -err); EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); return true; diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index bbce1c328d85..d62d802c9c12 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -269,4 +269,6 @@ static struct kunit_suite ext4_inode_test_suite = { .test_cases = ext4_inode_test_cases, }; -kunit_test_suite(ext4_inode_test_suite); +kunit_test_suites(&ext4_inode_test_suite); + +MODULE_LICENSE("GPL v2"); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 629a25d999f0..3313168b680f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,8 +48,6 @@ #include <trace/events/ext4.h> -#define MPAGE_DA_EXTENT_TAIL 0x01 - static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -271,6 +269,7 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { + ext4_set_errno(inode->i_sb, -err); ext4_error(inode->i_sb, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); @@ -402,7 +401,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (IS_ENCRYPTED(inode)) + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); @@ -2478,10 +2477,12 @@ update_disksize: EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); - if (err2) + if (err2) { + ext4_set_errno(inode->i_sb, -err2); ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", inode->i_ino); + } if (!err) err = err2; } @@ -3448,6 +3449,22 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } +static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +{ + int ret; + + /* + * Even for writes we don't need to allocate blocks, so just pretend + * we are reading to save overhead of starting a transaction. 
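
Together with the Kconfig tristate and Makefile hunks earlier, the inode-test.c change above lets the ext4 inode timestamp test build as a module: kunit_test_suites() replaces kunit_test_suite() and a MODULE_LICENSE is added. A minimal sketch of the shape a tristate KUnit test takes (suite and case names are hypothetical):

#include <kunit/test.h>
#include <linux/module.h>

static void example_case(struct kunit *test)
{
	KUNIT_EXPECT_EQ(test, 2 + 2, 4);
}

static struct kunit_case example_cases[] = {
	KUNIT_CASE(example_case),
	{}			/* array must be NULL-terminated */
};

static struct kunit_suite example_suite = {
	.name = "example-suite",
	.test_cases = example_cases,
};

kunit_test_suites(&example_suite);

MODULE_LICENSE("GPL v2");
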
+ */ + flags &= ~IOMAP_WRITE; + ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); + WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + return ret; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { @@ -3469,6 +3486,11 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; +const struct iomap_ops ext4_iomap_overwrite_ops = { + .iomap_begin = ext4_iomap_overwrite_begin, + .iomap_end = ext4_iomap_end, +}; + static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { @@ -3701,8 +3723,12 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks( - page, blocksize, bh_offset(bh))); + err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + bh_offset(bh)); + if (err) { + clear_buffer_uptodate(bh); + goto unlock; + } } } if (ext4_should_journal_data(inode)) { @@ -3912,9 +3938,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) unsigned int credits; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length, 0); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -4240,6 +4263,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; + if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) + goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4338,6 +4363,8 @@ make_io: blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + simulate_eio: + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, block, "unable to read itable block"); brelse(bh); @@ -4551,7 +4578,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(gen)); } - if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + if (!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); ret = -EFSBADCRC; @@ -5090,6 +5119,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, "IO error syncing inode"); err = -EIO; @@ -5368,7 +5398,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat, struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { + if ((request_mask & STATX_BTIME) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e8870fff8224..a0ec750018dd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1377,6 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_FSGETXATTR: + case EXT4_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a3e2767bdf2f..f64838187559 100644 --- 
a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); return 0; @@ -4063,6 +4064,7 @@ repeat: err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; @@ -4071,6 +4073,7 @@ repeat: bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); @@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 2305b4374fd3..1c44b1a32001 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -173,8 +173,10 @@ static int kmmpd(void *data) * (s_mmp_update_interval * 60) seconds. */ if (retval) { - if ((failed_writes % 60) == 0) + if ((failed_writes % 60) == 0) { + ext4_set_errno(sb, -retval); ext4_error(sb, "Error writing to MMP block"); + } failed_writes++; } @@ -205,6 +207,7 @@ static int kmmpd(void *data) retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { + ext4_set_errno(sb, -retval); ext4_error(sb, "error reading MMP data: %d", retval); goto exit_thread; @@ -218,6 +221,7 @@ static int kmmpd(void *data) "Error while updating MMP info. " "The filesystem seems to have been" " multiply mounted."); + ext4_set_errno(sb, EBUSY); ext4_error(sb, "abort"); put_bh(bh_check); retval = -EBUSY; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1cb42d940784..129d2ebae00d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0); + if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) + bh = ERR_PTR(-EIO); + else + bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " @@ -153,9 +156,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, * caller is sure it should be an index block. 
*/ if (is_dx_block && type == INDEX) { - if (ext4_dx_csum_verify(inode, dirent)) + if (ext4_dx_csum_verify(inode, dirent) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory index failed checksum"); brelse(bh); @@ -163,9 +168,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, } } if (!is_dx_block) { - if (ext4_dirblock_csum_verify(inode, bh)) + if (ext4_dirblock_csum_verify(inode, bh) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory block failed checksum"); brelse(bh); @@ -1002,7 +1009,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); -#ifdef CONFIG_FS_ENCRYPTION /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); @@ -1017,7 +1023,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, return err; } } -#endif + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -1065,9 +1071,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, } errout: brelse(bh); -#ifdef CONFIG_FS_ENCRYPTION fscrypt_fname_free_buffer(&fname_crypto_str); -#endif return count; } @@ -1527,6 +1531,7 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); @@ -1537,6 +1542,7 @@ restart: !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { + ext4_set_errno(sb, EFSBADCRC); EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 24aeedb8fc75..68b39e75446a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -512,17 +512,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + /* + * Since bounce page allocation uses a mempool, we can only use + * a waiting mask (i.e. request guaranteed allocation) on the + * first page of the bio. Otherwise it can deadlock. 
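
The comment added above explains why only the first bounce page of a bio may use a blocking, mempool-backed allocation: if a later page waited while earlier pages were still held, writers could each sit on partial allocations and deadlock the pool. A toy user-space sketch of the rule "release (submit) what you hold before you wait", with a two-slot pool standing in for the mempool (purely illustrative):

#include <stdbool.h>
#include <stdio.h>

static int free_slots = 2;	/* tiny shared pool */

static bool get_slot_nowait(void)
{
	if (free_slots > 0) {
		free_slots--;
		return true;
	}
	return false;
}

static void put_slot(void) { free_slots++; }

int main(void)
{
	int held = 0;

	for (int page = 0; page < 5; page++) {
		if (!get_slot_nowait()) {
			/* Analogue of ext4_io_submit(): give back what we
			 * hold before waiting, so the pool can drain. */
			printf("submitting %d pages, then retrying\n", held);
			while (held) { put_slot(); held--; }
			get_slot_nowait();	/* now guaranteed to succeed */
		}
		held++;
	}
	printf("submitting final %d pages\n", held);
	return 0;
}
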
+ */ + if (io->io_bio) + gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); - if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { - if (io->io_bio) { + if (ret == -ENOMEM && + (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { + gfp_flags = GFP_NOFS; + if (io->io_bio) ext4_io_submit(io); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } - gfp_flags |= __GFP_NOFAIL; + else + gfp_flags |= __GFP_NOFAIL; + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry_encrypt; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index fef7755300c3..c1769afbf799 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -57,6 +57,7 @@ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, + STEP_MAX, }; struct bio_post_read_ctx { @@ -106,10 +107,22 @@ static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fsverity_verify_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and although verity + * will be disabled for that, decryption may still be needed, causing + * another bio_post_read_ctx to be allocated. So to guarantee that + * mempool_alloc() never deadlocks we must free the current ctx first. + * This is safe because verity is the last post-read step. + */ + BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + + __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) @@ -176,12 +189,11 @@ static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } -static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, - struct bio *bio, - pgoff_t first_idx) +static void ext4_set_bio_post_read_ctx(struct bio *bio, + const struct inode *inode, + pgoff_t first_idx) { unsigned int post_read_steps = 0; - struct bio_post_read_ctx *ctx = NULL; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -190,14 +202,14 @@ static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { - ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); + /* Due to the mempool, this never fails. */ + struct bio_post_read_ctx *ctx = + mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } - return ctx; } static inline loff_t ext4_readpage_limit(struct inode *inode) @@ -358,24 +370,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct bio_post_read_ctx *ctx; - /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - ctx = get_bio_post_read_ctx(inode, bio, page->index); - if (IS_ERR(ctx)) { - bio_put(bio); - bio = NULL; - goto set_error_page; - } + ext4_set_bio_post_read_ctx(bio, inode, page->index); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio->bi_private = ctx; bio_set_op_attrs(bio, REQ_OP_READ, is_readahead ? 
REQ_RAHEAD : 0); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a8c0f2b5b6e1..86a2500ed292 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -824,9 +824,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto errout; - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", @@ -900,9 +899,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { brelse(gdb_bh); err = -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2937a8873fe1..88b213bd32bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -154,7 +154,7 @@ ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) if (bh == NULL) return ERR_PTR(-ENOMEM); - if (buffer_uptodate(bh)) + if (ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); wait_on_buffer(bh); @@ -204,26 +204,6 @@ void ext4_superblock_csum_set(struct super_block *sb) es->s_checksum = ext4_superblock_csum(sb, es); } -void *ext4_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -void *ext4_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -367,6 +347,8 @@ static void __save_error_info(struct super_block *sb, const char *func, ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); + if (es->s_last_error_errcode == 0) + es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED; if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; @@ -375,6 +357,7 @@ static void __save_error_info(struct super_block *sb, const char *func, es->s_first_error_line = cpu_to_le32(line); es->s_first_error_ino = es->s_last_error_ino; es->s_first_error_block = es->s_last_error_block; + es->s_first_error_errcode = es->s_last_error_errcode; } /* * Start the daily error reporting function if it hasn't been @@ -631,6 +614,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno, return errstr; } +void ext4_set_errno(struct super_block *sb, int err) +{ + if (err < 0) + err = -err; + + switch (err) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + 
break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + EXT4_SB(sb)->s_es->s_last_error_errcode = err; +} + /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ @@ -655,6 +698,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } + ext4_set_errno(sb, -errno); save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -982,8 +1026,10 @@ static void ext4_put_super(struct super_block *sb) aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if ((err < 0) && !aborted) + if ((err < 0) && !aborted) { + ext4_set_errno(sb, -err); ext4_abort(sb, "Couldn't clean up the journal"); + } } ext4_unregister_sysfs(sb); @@ -1085,8 +1131,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_da_metadata_calc_len = 0; - ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA @@ -1548,6 +1592,7 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "nodioread_nolock"}, {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, @@ -3720,6 +3765,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); + set_opt(sb, DIOREAD_NOLOCK); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3887,9 +3933,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " - "with data=journal disables delayed " - "allocation and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); @@ -5540,9 +5585,15 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ? - dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = 0; + if (dquot->dq_dqb.dqb_bsoftlimit && + (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit)) + limit = dquot->dq_dqb.dqb_bsoftlimit; + if (dquot->dq_dqb.dqb_bhardlimit && + (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) + limit = dquot->dq_dqb.dqb_bhardlimit; + limit >>= sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; @@ -5552,9 +5603,14 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? 
- dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = 0; + if (dquot->dq_dqb.dqb_isoftlimit && + (!limit || dquot->dq_dqb.dqb_isoftlimit < limit)) + limit = dquot->dq_dqb.dqb_isoftlimit; + if (dquot->dq_dqb.dqb_ihardlimit && + (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) + limit = dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index eb1efad0e20a..d218ebdafa4a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,6 +29,10 @@ typedef enum { attr_last_error_time, attr_feature, attr_pointer_ui, + attr_pointer_ul, + attr_pointer_u64, + attr_pointer_u8, + attr_pointer_string, attr_pointer_atomic, attr_journal_task, } attr_id_t; @@ -46,6 +50,7 @@ struct ext4_attr { struct attribute attr; short attr_id; short attr_ptr; + unsigned short attr_size; union { int offset; void *explicit_ptr; @@ -154,12 +159,35 @@ static struct ext4_attr ext4_attr_##_name = { \ }, \ } +#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_pointer_string, \ + .attr_size = _size, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + #define EXT4_RO_ATTR_ES_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) +#define EXT4_RO_ATTR_ES_U8(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_U64(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ + EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) +#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) + #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -194,7 +222,20 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +#ifdef CONFIG_EXT4_DEBUG +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); +#endif EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); +EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); +EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino); +EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); +EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); +EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); +EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); +EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line); +EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32); +EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); @@ -225,9 +266,22 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), 
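The ext4_statfs_project() hunks above replace "prefer the soft limit" with "use the smaller configured limit"; the open-coded comparisons reduce to the following rule (editor sketch, helper name hypothetical):

    /* a limit of 0 means "not configured"; otherwise report the tighter one */
    static qsize_t statfs_limit(qsize_t soft, qsize_t hard)
    {
        if (soft && hard)
            return min(soft, hard);
        return soft ? soft : hard;
    }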
ATTR_LIST(errors_count), + ATTR_LIST(first_error_ino), + ATTR_LIST(last_error_ino), + ATTR_LIST(first_error_block), + ATTR_LIST(last_error_block), + ATTR_LIST(first_error_line), + ATTR_LIST(last_error_line), + ATTR_LIST(first_error_func), + ATTR_LIST(last_error_func), + ATTR_LIST(first_error_errcode), + ATTR_LIST(last_error_errcode), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), +#ifdef CONFIG_EXT4_DEBUG + ATTR_LIST(simulate_fail), +#endif NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -280,7 +334,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { - return snprintf(buf, PAGE_SIZE, "%lld", + return snprintf(buf, PAGE_SIZE, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } @@ -318,6 +372,30 @@ static ssize_t ext4_attr_show(struct kobject *kobj, else return snprintf(buf, PAGE_SIZE, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%lu\n", + *((unsigned long *) ptr)); + case attr_pointer_u8: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned char *) ptr)); + case attr_pointer_u64: + if (!ptr) + return 0; + if (a->attr_ptr == ptr_ext4_super_block_offset) + return snprintf(buf, PAGE_SIZE, "%llu\n", + le64_to_cpup(ptr)); + else + return snprintf(buf, PAGE_SIZE, "%llu\n", + *((unsigned long long *) ptr)); + case attr_pointer_string: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size, + (char *) ptr); case attr_pointer_atomic: if (!ptr) return 0; @@ -361,6 +439,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_pointer_ul: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *((unsigned long *) ptr) = t; + return len; case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8966a5439a22..8cac7d95c3ad 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1456,7 +1456,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, if (!ce) return NULL; - ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; @@ -2879,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); - if (error == -EIO) + if (error == -EIO) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + } bh = NULL; goto cleanup; } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 599fb9194c6a..f0faada30f30 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -22,7 +22,7 @@ config F2FS_FS config F2FS_STAT_FS bool "F2FS Status Information" - depends on F2FS_FS && DEBUG_FS + depends on F2FS_FS default y help /sys/kernel/debug/f2fs/ contains information about all the partitions @@ -93,3 +93,28 @@ config F2FS_FAULT_INJECTION Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. If unsure, say N. + +config F2FS_FS_COMPRESSION + bool "F2FS compression feature" + depends on F2FS_FS + help + Enable filesystem-level compression on f2fs regular files, + multiple back-end compression algorithms are supported. 
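On the sysfs side, first_error_time and last_error_time are stored in the superblock as a 32-bit little-endian low word plus an 8-bit high byte; __print_tstamp() above recombines them (and now appends the previously missing newline). The same recombination, spelled out (editor illustration):

    /* hi supplies bits 32..39, lo the low 32 bits of the time64_t */
    time64_t t = ((time64_t)es->s_last_error_time_hi << 32) +
                 le32_to_cpu(es->s_last_error_time);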
+ +config F2FS_FS_LZO + bool "LZO compression support" + depends on F2FS_FS_COMPRESSION + select LZO_COMPRESS + select LZO_DECOMPRESS + default y + help + Support LZO compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4 + bool "LZ4 compression support" + depends on F2FS_FS_COMPRESSION + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default y + help + Support LZ4 compress algorithm, if unsure, say Y. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 2aaecc63834f..ee7316b42f69 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o +f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ffdaba0c55d2..44e84ac5c941 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1509,10 +1509,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_wait_on_all_pages_writeback(sbi); /* - * invalidate intermediate page cache borrowed from meta inode - * which are used for migration of encrypted inode's blocks. + * invalidate intermediate page cache borrowed from meta inode which are + * used for migration of encrypted or verity inode's blocks. */ - if (f2fs_sb_has_encrypt(sbi)) + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c new file mode 100644 index 000000000000..d8a64be90a50 --- /dev/null +++ b/fs/f2fs/compress.c @@ -0,0 +1,1176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs compress support + * + * Copyright (c) 2019 Chao Yu <chao@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/lzo.h> +#include <linux/lz4.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +struct f2fs_compress_ops { + int (*init_compress_ctx)(struct compress_ctx *cc); + void (*destroy_compress_ctx)(struct compress_ctx *cc); + int (*compress_pages)(struct compress_ctx *cc); + int (*decompress_pages)(struct decompress_io_ctx *dic); +}; + +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + return index & (cc->cluster_size - 1); +} + +static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) +{ + return index >> cc->log_cluster_size; +} + +static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) +{ + return cc->cluster_idx << cc->log_cluster_size; +} + +bool f2fs_is_compressed_page(struct page *page) +{ + if (!PagePrivate(page)) + return false; + if (!page_private(page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + return false; + f2fs_bug_on(F2FS_M_SB(page->mapping), + *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + return true; +} + +static void f2fs_set_compressed_page(struct page *page, + struct inode *inode, pgoff_t index, void *data, refcount_t *r) +{ + SetPagePrivate(page); + set_page_private(page, (unsigned long)data); + + /* i_crypto_info and iv index */ + page->index = index; + page->mapping = inode->i_mapping; + if (r) + refcount_inc(r); +} + +static void f2fs_put_compressed_page(struct page *page) +{ + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + page->mapping = NULL; + unlock_page(page); + put_page(page); +} + +static void f2fs_drop_rpages(struct 
compress_ctx *cc, int len, bool unlock) +{ + int i; + + for (i = 0; i < len; i++) { + if (!cc->rpages[i]) + continue; + if (unlock) + unlock_page(cc->rpages[i]); + else + put_page(cc->rpages[i]); + } +} + +static void f2fs_put_rpages(struct compress_ctx *cc) +{ + f2fs_drop_rpages(cc, cc->cluster_size, false); +} + +static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) +{ + f2fs_drop_rpages(cc, len, true); +} + +static void f2fs_put_rpages_mapping(struct compress_ctx *cc, + struct address_space *mapping, + pgoff_t start, int len) +{ + int i; + + for (i = 0; i < len; i++) { + struct page *page = find_get_page(mapping, start + i); + + put_page(page); + put_page(page); + } +} + +static void f2fs_put_rpages_wbc(struct compress_ctx *cc, + struct writeback_control *wbc, bool redirty, int unlock) +{ + unsigned int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + if (redirty) + redirty_page_for_writepage(wbc, cc->rpages[i]); + f2fs_put_page(cc->rpages[i], unlock); + } +} + +struct page *f2fs_compress_control_page(struct page *page) +{ + return ((struct compress_io_ctx *)page_private(page))->rpages[0]; +} + +int f2fs_init_compress_ctx(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + + if (cc->nr_rpages) + return 0; + + cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + return cc->rpages ? 0 : -ENOMEM; +} + +void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +{ + kfree(cc->rpages); + cc->rpages = NULL; + cc->nr_rpages = 0; + cc->nr_cpages = 0; + cc->cluster_idx = NULL_CLUSTER; +} + +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) +{ + unsigned int cluster_ofs; + + if (!f2fs_cluster_can_merge_page(cc, page->index)) + f2fs_bug_on(F2FS_I_SB(cc->inode), 1); + + cluster_ofs = offset_in_cluster(cc, page->index); + cc->rpages[cluster_ofs] = page; + cc->nr_rpages++; + cc->cluster_idx = cluster_idx(cc, page->index); +} + +#ifdef CONFIG_F2FS_FS_LZO +static int lzo_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZO1X_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lzo_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lzo_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + return -EIO; + } + return 0; +} + +static int lzo_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, + dic->rbuf, &dic->rlen); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzo_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzo_compress_pages, + 
.decompress_pages = lzo_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZ4 +static int lz4_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZ4_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lz4_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lz4_compress_pages(struct compress_ctx *cc) +{ + int len; + + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) { + printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id); + return -EIO; + } + cc->clen = len; + return 0; +} + +static int lz4_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, + dic->clen, dic->rlen); + if (ret < 0) { + printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (ret != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lz4_ops = { + .init_compress_ctx = lz4_init_compress_ctx, + .destroy_compress_ctx = lz4_destroy_compress_ctx, + .compress_pages = lz4_compress_pages, + .decompress_pages = lz4_decompress_pages, +}; +#endif + +static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { +#ifdef CONFIG_F2FS_FS_LZO + &f2fs_lzo_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_LZ4 + &f2fs_lz4_ops, +#else + NULL, +#endif +}; + +bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; +} + +static struct page *f2fs_grab_page(void) +{ + struct page *page; + + page = alloc_page(GFP_NOFS); + if (!page) + return NULL; + lock_page(page); + return page; +} + +static int f2fs_compress_pages(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + unsigned int max_len, nr_cpages; + int i, ret; + + trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, + cc->cluster_size, fi->i_compress_algorithm); + + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + + max_len = COMPRESS_HEADER_SIZE + cc->clen; + cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + + cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + cc->nr_cpages, GFP_NOFS); + if (!cc->cpages) { + ret = -ENOMEM; + goto destroy_compress_ctx; + } + + for (i = 0; i < cc->nr_cpages; i++) { + cc->cpages[i] = f2fs_grab_page(); + if (!cc->cpages[i]) { + ret = -ENOMEM; + goto out_free_cpages; + } + } + + cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + if (!cc->rbuf) { + ret = -ENOMEM; + goto out_free_cpages; + } + + cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + if (!cc->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + ret = cops->compress_pages(cc); + if (ret) + goto out_vunmap_cbuf; + + max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; + + if (cc->clen > max_len) { + ret = -EAGAIN; + goto out_vunmap_cbuf; 
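The -EAGAIN just above is the "not worth compressing" exit: a cluster is only stored compressed when header plus payload fit in at least one page less than the raw cluster; otherwise the caller falls back to writing the pages uncompressed. A worked bound for the common configuration (editor illustration; 4 KiB pages and a 4-page cluster assumed):

    /* cluster_size = 4, PAGE_SIZE = 4096 */
    max_len = PAGE_SIZE * (cluster_size - 1) - COMPRESS_HEADER_SIZE;
    /* = 12288 - COMPRESS_HEADER_SIZE: a 16 KiB cluster must compress
     * below this, or f2fs_write_multi_pages() writes it as raw pages */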
+ } + + cc->cbuf->clen = cpu_to_le32(cc->clen); + cc->cbuf->chksum = cpu_to_le32(0); + + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) + cc->cbuf->reserved[i] = cpu_to_le32(0); + + vunmap(cc->cbuf); + vunmap(cc->rbuf); + + nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + for (i = nr_cpages; i < cc->nr_cpages; i++) { + f2fs_put_compressed_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + + cc->nr_cpages = nr_cpages; + + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return 0; + +out_vunmap_cbuf: + vunmap(cc->cbuf); +out_vunmap_rbuf: + vunmap(cc->rbuf); +out_free_cpages: + for (i = 0; i < cc->nr_cpages; i++) { + if (cc->cpages[i]) + f2fs_put_compressed_page(cc->cpages[i]); + } + kfree(cc->cpages); + cc->cpages = NULL; +destroy_compress_ctx: + cops->destroy_compress_ctx(cc); +out: + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return ret; +} + +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_inode_info *fi= F2FS_I(dic->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + int ret; + + dec_page_count(sbi, F2FS_RD_DATA); + + if (bio->bi_status || PageError(page)) + dic->failed = true; + + if (refcount_dec_not_one(&dic->ref)) + return; + + trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, + dic->cluster_size, fi->i_compress_algorithm); + + /* submit partial compressed pages */ + if (dic->failed) { + ret = -EIO; + goto out_free_dic; + } + + dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); + if (!dic->rbuf) { + ret = -ENOMEM; + goto out_free_dic; + } + + dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); + if (!dic->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + dic->clen = le32_to_cpu(dic->cbuf->clen); + dic->rlen = PAGE_SIZE << dic->log_cluster_size; + + if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { + ret = -EFSCORRUPTED; + goto out_vunmap_cbuf; + } + + ret = cops->decompress_pages(dic); + +out_vunmap_cbuf: + vunmap(dic->cbuf); +out_vunmap_rbuf: + vunmap(dic->rbuf); +out_free_dic: + if (!verity) + f2fs_decompress_end_io(dic->rpages, dic->cluster_size, + ret, false); + + trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, + dic->clen, ret); + if (!verity) + f2fs_free_dic(dic); +} + +static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + if (cc->cluster_idx == NULL_CLUSTER) + return true; + return cc->cluster_idx == cluster_idx(cc, index); +} + +bool f2fs_cluster_is_empty(struct compress_ctx *cc) +{ + return cc->nr_rpages == 0; +} + +static bool f2fs_cluster_is_full(struct compress_ctx *cc) +{ + return cc->cluster_size == cc->nr_rpages; +} + +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) +{ + if (f2fs_cluster_is_empty(cc)) + return true; + return is_page_in_cluster(cc, index); +} + +static bool __cluster_may_compress(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + loff_t i_size = i_size_read(cc->inode); + unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); + int i; + + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + f2fs_bug_on(sbi, !page); + + if (unlikely(f2fs_cp_error(sbi))) + return false; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return false; + + /* beyond EOF */ 
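f2fs_cluster_can_merge_page() is what keeps a batch of pages inside one cluster: a page may join the context only while its cluster index matches the one being built. A small worked example of the index math used throughout this file (editor illustration; log_cluster_size = 2, i.e. 4-page clusters, assumed):

    pgoff_t index = 9;                                 /* page 9 of the file      */
    pgoff_t cidx  = index >> cc->log_cluster_size;     /* 9 >> 2 = 2              */
    unsigned off  = index & (cc->cluster_size - 1);    /* 9 &  3 = 1              */
    pgoff_t start = cidx  << cc->log_cluster_size;     /* 2 << 2 = 8, first page  */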
+ if (page->index >= nr_pages) + return false; + } + return true; +} + +/* return # of compressed block addresses */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + struct dnode_of_data dn; + int ret; + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), + LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) + ret = 0; + goto fail; + } + + if (dn.data_blkaddr == COMPRESS_ADDR) { + int i; + + ret = 1; + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node + i); + if (blkaddr != NULL_ADDR) + ret++; + } + } +fail: + f2fs_put_dnode(&dn); + return ret; +} + +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + }; + + return f2fs_compressed_blocks(&cc); +} + +static bool cluster_may_compress(struct compress_ctx *cc) +{ + if (!f2fs_compressed_file(cc->inode)) + return false; + if (f2fs_is_atomic_file(cc->inode)) + return false; + if (f2fs_is_mmap_file(cc->inode)) + return false; + if (!f2fs_cluster_is_full(cc)) + return false; + return __cluster_may_compress(cc); +} + +static void set_cluster_writeback(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) + set_page_writeback(cc->rpages[i]); + } +} + +static void set_cluster_dirty(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) + if (cc->rpages[i]) + set_page_dirty(cc->rpages[i]); +} + +static int prepare_compress_overwrite(struct compress_ctx *cc, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct address_space *mapping = cc->inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + sector_t last_block_in_bio; + unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i, ret; + bool prealloc; + +retry: + ret = f2fs_compressed_blocks(cc); + if (ret <= 0) + return ret; + + /* compressed case */ + prealloc = (ret < cc->cluster_size); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + return ret; + + /* keep page reference to avoid page reclaim */ + for (i = 0; i < cc->cluster_size; i++) { + page = f2fs_pagecache_get_page(mapping, start_idx + i, + fgp_flag, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto unlock_pages; + } + + if (PageUptodate(page)) + unlock_page(page); + else + f2fs_compress_ctx_add_page(cc, page); + } + + if (!f2fs_cluster_is_empty(cc)) { + struct bio *bio = NULL; + + ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, + &last_block_in_bio, false); + f2fs_destroy_compress_ctx(cc); + if (ret) + goto release_pages; + if (bio) + f2fs_submit_bio(sbi, bio, DATA); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + goto release_pages; + } + + for (i = 0; i < cc->cluster_size; i++) { + f2fs_bug_on(sbi, cc->rpages[i]); + + page = find_lock_page(mapping, start_idx + i); + f2fs_bug_on(sbi, !page); + + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_compress_ctx_add_page(cc, page); + f2fs_put_page(page, 0); + + if (!PageUptodate(page)) { + f2fs_unlock_rpages(cc, i + 1); + f2fs_put_rpages_mapping(cc, mapping, start_idx, + cc->cluster_size); + f2fs_destroy_compress_ctx(cc); + goto retry; + } + } + + if (prealloc) { + __do_map_lock(sbi, 
F2FS_GET_BLOCK_PRE_AIO, true); + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + for (i = cc->cluster_size - 1; i > 0; i--) { + ret = f2fs_get_block(&dn, start_idx + i); + if (ret) { + i = cc->cluster_size; + break; + } + + if (dn.data_blkaddr != NEW_ADDR) + break; + } + + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + } + + if (likely(!ret)) { + *fsdata = cc->rpages; + *pagep = cc->rpages[offset_in_cluster(cc, index)]; + return cc->cluster_size; + } + +unlock_pages: + f2fs_unlock_rpages(cc, i); +release_pages: + f2fs_put_rpages_mapping(cc, mapping, start_idx, i); + f2fs_destroy_compress_ctx(cc); + return ret; +} + +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + .rpages = NULL, + .nr_rpages = 0, + }; + + return prepare_compress_overwrite(&cc, pagep, index, fsdata); +} + +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied) + +{ + struct compress_ctx cc = { + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .rpages = fsdata, + }; + bool first_index = (index == cc.rpages[0]->index); + + if (copied) + set_cluster_dirty(&cc); + + f2fs_put_rpages_wbc(&cc, NULL, false, 1); + f2fs_destroy_compress_ctx(&cc); + + return first_index; +} + +static int f2fs_write_compressed_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = cc->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NEW_ADDR, + .page = NULL, + .encrypted_page = NULL, + .compressed_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, + .io_wbc = wbc, + .encrypted = f2fs_encrypted_file(cc->inode), + }; + struct dnode_of_data dn; + struct node_info ni; + struct compress_io_ctx *cic; + pgoff_t start_idx = start_idx_of_cluster(cc); + unsigned int last_index = cc->cluster_size - 1; + loff_t psize; + int i, err; + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + f2fs_lock_op(sbi); + + err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (err) + goto out_unlock_op; + + for (i = 0; i < cc->cluster_size; i++) { + if (datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i) == NULL_ADDR) + goto out_put_dnode; + } + + psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto out_put_dnode; + + fio.version = ni.version; + + cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + if (!cic) + goto out_put_dnode; + + cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + cic->inode = inode; + refcount_set(&cic->ref, 1); + cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + if (!cic->rpages) + goto out_put_cic; + + cic->nr_rpages = cc->cluster_size; + + for (i = 0; i < cc->nr_cpages; i++) { + f2fs_set_compressed_page(cc->cpages[i], inode, + cc->rpages[i + 1]->index, + cic, i ? 
&cic->ref : NULL); + fio.compressed_page = cc->cpages[i]; + if (fio.encrypted) { + fio.page = cc->rpages[i + 1]; + err = f2fs_encrypt_one_page(&fio); + if (err) + goto out_destroy_crypt; + cc->cpages[i] = fio.encrypted_page; + } + } + + set_cluster_writeback(cc); + + for (i = 0; i < cc->cluster_size; i++) + cic->rpages[i] = cc->rpages[i]; + + for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node); + fio.page = cic->rpages[i]; + fio.old_blkaddr = blkaddr; + + /* cluster header */ + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + fio.compr_blocks++; + if (__is_valid_data_blkaddr(blkaddr)) + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); + goto unlock_continue; + } + + if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) + fio.compr_blocks++; + + if (i > cc->nr_cpages) { + if (__is_valid_data_blkaddr(blkaddr)) { + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } + goto unlock_continue; + } + + f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); + + if (fio.encrypted) + fio.encrypted_page = cc->cpages[i - 1]; + else + fio.compressed_page = cc->cpages[i - 1]; + + cc->cpages[i - 1] = NULL; + f2fs_outplace_write_data(&dn, &fio); + (*submitted)++; +unlock_continue: + inode_dec_dirty_pages(cc->inode); + unlock_page(fio.page); + } + + if (fio.compr_blocks) + f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); + f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + + set_inode_flag(cc->inode, FI_APPEND_WRITE); + if (cc->cluster_idx == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + down_write(&fi->i_sem); + if (fi->last_disk_size < psize) + fi->last_disk_size = psize; + up_write(&fi->i_sem); + + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc); + return 0; + +out_destroy_crypt: + kfree(cic->rpages); + + for (--i; i >= 0; i--) + fscrypt_finalize_bounce_page(&cc->cpages[i]); + for (i = 0; i < cc->nr_cpages; i++) { + if (!cc->cpages[i]) + continue; + f2fs_put_page(cc->cpages[i], 1); + } +out_put_cic: + kfree(cic); +out_put_dnode: + f2fs_put_dnode(&dn); +out_unlock_op: + f2fs_unlock_op(sbi); + return -EAGAIN; +} + +void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct compress_io_ctx *cic = + (struct compress_io_ctx *)page_private(page); + int i; + + if (unlikely(bio->bi_status)) + mapping_set_error(cic->inode->i_mapping, -EIO); + + f2fs_put_compressed_page(page); + + dec_page_count(sbi, F2FS_WB_DATA); + + if (refcount_dec_not_one(&cic->ref)) + return; + + for (i = 0; i < cic->nr_rpages; i++) { + WARN_ON(!cic->rpages[i]); + clear_cold_data(cic->rpages[i]); + end_page_writeback(cic->rpages[i]); + } + + kfree(cic->rpages); + kfree(cic); +} + +static int f2fs_write_raw_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct address_space *mapping = cc->inode->i_mapping; + int _submitted, compr_blocks, ret; + int i = -1, err = 0; + + compr_blocks = f2fs_compressed_blocks(cc); + if (compr_blocks < 0) { + err = compr_blocks; + goto out_err; + } + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; +retry_write: + if (cc->rpages[i]->mapping != mapping) { + unlock_page(cc->rpages[i]); + continue; + } + + BUG_ON(!PageLocked(cc->rpages[i])); + + ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, + NULL, 
NULL, wbc, io_type, + compr_blocks); + if (ret) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(cc->rpages[i]); + ret = 0; + } else if (ret == -EAGAIN) { + ret = 0; + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + lock_page(cc->rpages[i]); + clear_page_dirty_for_io(cc->rpages[i]); + goto retry_write; + } + err = ret; + goto out_fail; + } + + *submitted += _submitted; + } + return 0; + +out_fail: + /* TODO: revoke partially updated block addresses */ + BUG_ON(compr_blocks); +out_err: + for (++i; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + return err; +} + +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + int err; + + *submitted = 0; + if (cluster_may_compress(cc)) { + err = f2fs_compress_pages(cc); + if (err == -EAGAIN) { + goto write; + } else if (err) { + f2fs_put_rpages_wbc(cc, wbc, true, 1); + goto destroy_out; + } + + err = f2fs_write_compressed_pages(cc, submitted, + wbc, io_type); + cops->destroy_compress_ctx(cc); + if (!err) + return 0; + f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); + } +write: + f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); + + err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); + f2fs_put_rpages_wbc(cc, wbc, false, 0); +destroy_out: + f2fs_destroy_compress_ctx(cc); + return err; +} + +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct decompress_io_ctx *dic; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i; + + dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + if (!dic) + return ERR_PTR(-ENOMEM); + + dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + if (!dic->rpages) { + kfree(dic); + return ERR_PTR(-ENOMEM); + } + + dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + dic->inode = cc->inode; + refcount_set(&dic->ref, 1); + dic->cluster_idx = cc->cluster_idx; + dic->cluster_size = cc->cluster_size; + dic->log_cluster_size = cc->log_cluster_size; + dic->nr_cpages = cc->nr_cpages; + dic->failed = false; + + for (i = 0; i < dic->cluster_size; i++) + dic->rpages[i] = cc->rpages[i]; + dic->nr_rpages = cc->cluster_size; + + dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->nr_cpages, GFP_NOFS); + if (!dic->cpages) + goto out_free; + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page; + + page = f2fs_grab_page(); + if (!page) + goto out_free; + + f2fs_set_compressed_page(page, cc->inode, + start_idx + i + 1, + dic, i ? 
&dic->ref : NULL); + dic->cpages[i] = page; + } + + dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->cluster_size, GFP_NOFS); + if (!dic->tpages) + goto out_free; + + for (i = 0; i < dic->cluster_size; i++) { + if (cc->rpages[i]) + continue; + + dic->tpages[i] = f2fs_grab_page(); + if (!dic->tpages[i]) + goto out_free; + } + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->tpages[i]) + continue; + dic->tpages[i] = cc->rpages[i]; + } + + return dic; + +out_free: + f2fs_free_dic(dic); + return ERR_PTR(-ENOMEM); +} + +void f2fs_free_dic(struct decompress_io_ctx *dic) +{ + int i; + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + f2fs_put_page(dic->tpages[i], 1); + } + kfree(dic->tpages); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_put_compressed_page(dic->cpages[i]); + } + kfree(dic->cpages); + } + + kfree(dic->rpages); + kfree(dic); +} + +void f2fs_decompress_end_io(struct page **rpages, + unsigned int cluster_size, bool err, bool verity) +{ + int i; + + for (i = 0; i < cluster_size; i++) { + struct page *rpage = rpages[i]; + + if (!rpage) + continue; + + if (err || PageError(rpage)) { + ClearPageUptodate(rpage); + ClearPageError(rpage); + } else { + if (!verity || fsverity_verify_page(rpage)) + SetPageUptodate(rpage); + else + SetPageError(rpage); + } + unlock_page(rpage); + } +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0fa356e94ef5..8bd9afa81c54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -31,6 +31,47 @@ static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; +static struct bio_set f2fs_bioset; + +#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE + +int __init f2fs_init_bioset(void) +{ + if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_bioset(void) +{ + bioset_exit(&f2fs_bioset); +} + +static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, + unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); +} + +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail) +{ + struct bio *bio; + + if (no_fail) { + /* No failure on bio allocation */ + bio = __f2fs_bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); + return NULL; + } + + return __f2fs_bio_alloc(GFP_KERNEL, npages); +} static bool __is_cp_guaranteed(struct page *page) { @@ -41,6 +82,9 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; + if (f2fs_is_compressed_page(page)) + return false; + inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -73,19 +117,19 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_INITIAL = 0, STEP_DECRYPT, + STEP_DECOMPRESS, STEP_VERITY, }; struct bio_post_read_ctx { struct bio *bio; + struct f2fs_sb_info *sbi; struct work_struct work; - unsigned int cur_step; unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio) +static void __read_end_io(struct bio *bio, bool compr, bool verity) { struct page *page; struct bio_vec *bv; @@ -94,6 +138,13 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; +#ifdef 
CONFIG_F2FS_FS_COMPRESSION + if (compr && f2fs_is_compressed_page(page)) { + f2fs_decompress_pages(bio, page, verity); + continue; + } +#endif + /* PG_error was set if any post_read step failed */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); @@ -105,31 +156,107 @@ static void __read_end_io(struct bio *bio) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); +} + +static void f2fs_release_read_bio(struct bio *bio); +static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) +{ + if (!compr) + __read_end_io(bio, false, verity); + f2fs_release_read_bio(bio); +} + +static void f2fs_decompress_bio(struct bio *bio, bool verity) +{ + __read_end_io(bio, true, verity); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); -static void decrypt_work(struct work_struct *work) +static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) +{ + fscrypt_decrypt_bio(ctx->bio); +} + +static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) +{ + f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) +{ + f2fs_decompress_end_io(rpages, cluster_size, false, true); +} + +static void f2fs_verify_bio(struct bio *bio) +{ + struct page *page = bio_first_page_all(bio); + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_verify_pages(dic->rpages, dic->cluster_size); + f2fs_free_dic(dic); +} +#endif + +static void f2fs_verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int enabled_steps = ctx->enabled_steps; +#endif - fscrypt_decrypt_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and while verity + * will be disabled for this, decryption may still be needed, resulting + * in another bio_post_read_ctx being allocated. So to prevent + * deadlocks we need to release the current ctx to the mempool first. + * This assumes that verity is the last post-read step. 
+ */ + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* previous step is decompression */ + if (enabled_steps & (1 << STEP_DECOMPRESS)) { + f2fs_verify_bio(bio); + f2fs_release_read_bio(bio); + return; + } +#endif - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + __f2fs_read_end_io(bio, false, false); } -static void verity_work(struct work_struct *work) +static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - fsverity_verify_bio(ctx->bio); + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) + f2fs_decrypt_work(ctx); - bio_post_read_processing(ctx); + if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) + f2fs_decompress_work(ctx); + + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + + __f2fs_read_end_io(ctx->bio, + ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); +} + +static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, + struct work_struct *work) +{ + queue_work(sbi->post_read_wq, work); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) @@ -139,31 +266,26 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ - switch (++ctx->cur_step) { - case STEP_DECRYPT: - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { - INIT_WORK(&ctx->work, decrypt_work); - fscrypt_enqueue_decrypt_work(&ctx->work); - return; - } - ctx->cur_step++; - /* fall-through */ - case STEP_VERITY: - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - ctx->cur_step++; - /* fall-through */ - default: - __read_end_io(ctx->bio); + + if (ctx->enabled_steps & (1 << STEP_DECRYPT) || + ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); + return; } + + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + + __f2fs_read_end_io(ctx->bio, false, false); } static bool f2fs_bio_post_read_required(struct bio *bio) { - return bio->bi_private && !bio->bi_status; + return bio->bi_private; } static void f2fs_read_end_io(struct bio *bio) @@ -178,12 +300,11 @@ static void f2fs_read_end_io(struct bio *bio) if (f2fs_bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; - ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } - __read_end_io(bio); + __f2fs_read_end_io(bio, false, false); } static void f2fs_write_end_io(struct bio *bio) @@ -214,6 +335,13 @@ static void f2fs_write_end_io(struct bio *bio) fscrypt_finalize_bounce_page(&page); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_is_compressed_page(page)) { + f2fs_compress_write_end_io(bio, page); + continue; + } +#endif + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) @@ -358,6 +486,12 @@ submit_io: submit_bio(bio); } +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + __submit_bio(sbi, bio, type); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -380,7 +514,6 @@ static bool __has_merged_page(struct bio 
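The restructured post-read path now fans out as decrypt, then decompress, then verity, with verity deliberately queued last so the bio_post_read_ctx can be returned to the mempool before fs-verity possibly issues further reads. How the steps get selected per bio, condensed from f2fs_grab_read_bio() later in this patch (editor illustration):

    unsigned int steps = 0;

    if (f2fs_encrypted_file(inode))
        steps |= 1 << STEP_DECRYPT;
    if (f2fs_compressed_file(inode))
        steps |= 1 << STEP_DECOMPRESS;
    if (f2fs_need_verity(inode, first_idx))
        steps |= 1 << STEP_VERITY;
    /* decrypt/decompress run on sbi->post_read_wq; verity goes to the
     * fsverity workqueue as the final step */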
*bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; - struct page *target; struct bvec_iter_all iter_all; if (!bio) @@ -390,10 +523,18 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return true; bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *target = bvec->bv_page; - target = bvec->bv_page; - if (fscrypt_is_bounce_page(target)) + if (fscrypt_is_bounce_page(target)) { target = fscrypt_pagecache_page(target); + if (IS_ERR(target)) + continue; + } + if (f2fs_is_compressed_page(target)) { + target = f2fs_compress_control_page(target); + if (IS_ERR(target)) + continue; + } if (inode && inode == target->mapping->host) return true; @@ -588,7 +729,8 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, found = true; - if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { + if (bio_add_page(*bio, page, PAGE_SIZE, 0) == + PAGE_SIZE) { ret = 0; break; } @@ -728,7 +870,12 @@ next: verify_fio_blkaddr(fio); - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + if (fio->encrypted_page) + bio_page = fio->encrypted_page; + else if (fio->compressed_page) + bio_page = fio->compressed_page; + else + bio_page = fio->page; /* set submitted = true as a return value */ fio->submitted = true; @@ -797,17 +944,16 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; - + if (f2fs_compressed_file(inode)) + post_read_steps |= 1 << STEP_DECOMPRESS; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { + /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) { - bio_put(bio); - return ERR_PTR(-ENOMEM); - } ctx->bio = bio; + ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } @@ -815,6 +961,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } +static void f2fs_release_read_bio(struct bio *bio) +{ + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr) @@ -1180,19 +1333,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) int err = 0; bool direct_io = iocb->ki_flags & IOCB_DIRECT; - /* convert inline data for Direct I/O*/ - if (direct_io) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - - if (direct_io && allow_outplace_dio(inode, iocb, from)) - return 0; - - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - return 0; - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -1872,6 +2012,144 @@ out: return ret; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead) +{ + struct dnode_of_data dn; + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio = *bio_ret; + unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; + sector_t last_block_in_file; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct decompress_io_ctx *dic = NULL; + int i; + int ret = 0; + + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); + + last_block_in_file = 
(i_size_read(inode) + blocksize - 1) >> blkbits; + + /* get rid of pages beyond EOF */ + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + if (!page) + continue; + if ((sector_t)page->index >= last_block_in_file) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + } else if (!PageUptodate(page)) { + continue; + } + unlock_page(page); + cc->rpages[i] = NULL; + cc->nr_rpages--; + } + + /* we are done since all pages are beyond EOF */ + if (f2fs_cluster_is_empty(cc)) + goto out; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + goto out; + + /* cluster was overwritten as normal cluster */ + if (dn.data_blkaddr != COMPRESS_ADDR) + goto out; + + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + ret = -EFAULT; + goto out_put_dnode; + } + cc->nr_cpages++; + } + + /* nothing to decompress */ + if (cc->nr_cpages == 0) { + ret = 0; + goto out_put_dnode; + } + + dic = f2fs_alloc_dic(cc); + if (IS_ERR(dic)) { + ret = PTR_ERR(dic); + goto out_put_dnode; + } + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page = dic->cpages[i]; + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); + + if (bio && !page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr)) { +submit_and_realloc: + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + + if (!bio) { + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + is_readahead ? REQ_RAHEAD : 0, + page->index); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + bio = NULL; + dic->failed = true; + if (refcount_sub_and_test(dic->nr_cpages - i, + &dic->ref)) + f2fs_decompress_end_io(dic->rpages, + cc->cluster_size, true, + false); + f2fs_free_dic(dic); + f2fs_put_dnode(&dn); + *bio_ret = bio; + return ret; + } + } + + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + inc_page_count(sbi, F2FS_RD_DATA); + ClearPageError(page); + *last_block_in_bio = blkaddr; + } + + f2fs_put_dnode(&dn); + + *bio_ret = bio; + return 0; + +out_put_dnode: + f2fs_put_dnode(&dn); +out: + f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + *bio_ret = bio; + return ret; +} +#endif + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
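Reading the hunk above together with the earlier write path: a compressed cluster is marked by COMPRESS_ADDR in its first block-address slot, the compressed payload (header included) occupies the next nr_cpages slots, and the remaining slots carry no physical block. A schematic for a 4-page cluster that compressed into 2 blocks (editor illustration):

    /*
     * dnode block addresses for one 4-page cluster:
     *
     *   ofs + 0 : COMPRESS_ADDR        cluster marker, no data block
     *   ofs + 1 : blkaddr A            compress header + payload
     *   ofs + 2 : blkaddr B            payload, continued
     *   ofs + 3 : NEW_ADDR/NULL_ADDR   reserved, no physical block
     */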
@@ -1889,6 +2167,19 @@ int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; struct f2fs_map_blocks map; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .cpages = NULL, + .nr_rpages = 0, + .nr_cpages = 0, + }; +#endif + unsigned max_nr_pages = nr_pages; int ret = 0; map.m_pblk = 0; @@ -1912,9 +2203,41 @@ int f2fs_mpage_readpages(struct address_space *mapping, goto next_page; } - ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio, - &last_block_in_bio, is_readahead); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* there are remained comressed pages, submit them */ + if (!f2fs_cluster_can_merge_page(&cc, page->index)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + is_readahead); + f2fs_destroy_compress_ctx(&cc); + if (ret) + goto set_error_page; + } + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) + goto read_single_page; + + ret = f2fs_init_compress_ctx(&cc); + if (ret) + goto set_error_page; + + f2fs_compress_ctx_add_page(&cc, page); + + goto next_page; + } +read_single_page: +#endif + + ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, + &bio, &last_block_in_bio, is_readahead); if (ret) { +#ifdef CONFIG_F2FS_FS_COMPRESSION +set_error_page: +#endif SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); @@ -1922,6 +2245,19 @@ int f2fs_mpage_readpages(struct address_space *mapping, next_page: if (pages) put_page(page); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* last page */ + if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + is_readahead); + f2fs_destroy_compress_ctx(&cc); + } + } +#endif } BUG_ON(pages && !list_empty(pages)); if (bio) @@ -1936,6 +2272,11 @@ static int f2fs_read_data_page(struct file *file, struct page *page) trace_f2fs_readpage(page, DATA); + if (!f2fs_is_compress_backend_ready(inode)) { + unlock_page(page); + return -EOPNOTSUPP; + } + /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); @@ -1954,6 +2295,9 @@ static int f2fs_read_data_pages(struct file *file, trace_f2fs_readpages(inode, page, nr_pages); + if (!f2fs_is_compress_backend_ready(inode)) + return 0; + /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) return 0; @@ -1961,22 +2305,23 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); } -static int encrypt_one_page(struct f2fs_io_info *fio) +int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - struct page *mpage; + struct page *mpage, *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) return 0; + page = fio->compressed_page ? 
fio->compressed_page : fio->page; + /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page, - PAGE_SIZE, 0, - gfp_flags); + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { @@ -2136,7 +2481,7 @@ got_it: if (ipu_force || (__is_valid_data_blkaddr(fio->old_blkaddr) && need_inplace_update(fio))) { - err = encrypt_one_page(fio); + err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; @@ -2172,13 +2517,16 @@ got_it: fio->version = ni.version; - err = encrypt_one_page(fio); + err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; set_page_writeback(page); ClearPageError(page); + if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) + f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); + /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); @@ -2193,16 +2541,17 @@ out: return err; } -static int __write_data_page(struct page *page, bool *submitted, +int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, - enum iostat_type io_type) + enum iostat_type io_type, + int compr_blocks) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); - const pgoff_t end_index = ((unsigned long long) i_size) + const pgoff_t end_index = ((unsigned long long)i_size) >> PAGE_SHIFT; loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; @@ -2218,6 +2567,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, .io_type = io_type, .io_wbc = wbc, @@ -2242,7 +2592,9 @@ static int __write_data_page(struct page *page, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (page->index < end_index || f2fs_verity_in_progress(inode)) + if (page->index < end_index || + f2fs_verity_in_progress(inode) || + compr_blocks) goto write; /* @@ -2318,7 +2670,6 @@ out: f2fs_remove_dirty_inode(inode); submitted = NULL; } - unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->cp_task) @@ -2331,7 +2682,7 @@ out: } if (submitted) - *submitted = fio.submitted; + *submitted = fio.submitted ? 
1 : 0; return 0; @@ -2352,7 +2703,23 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO); +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = page->mapping->host; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + goto out; + + if (f2fs_compressed_file(inode)) { + if (f2fs_is_compressed_cluster(inode, page->index)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } + } +out: +#endif + + return f2fs_write_single_data_page(page, NULL, NULL, NULL, + wbc, FS_DATA_IO, 0); } /* @@ -2365,11 +2732,27 @@ static int f2fs_write_cache_pages(struct address_space *mapping, enum iostat_type io_type) { int ret = 0; - int done = 0; + int done = 0, retry = 0; struct pagevec pvec; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = mapping->host; + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .nr_rpages = 0, + .cpages = NULL, + .rbuf = NULL, + .cbuf = NULL, + .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, + .private = NULL, + }; +#endif int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; @@ -2379,6 +2762,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; int nwritten = 0; + int submitted = 0; + int i; pagevec_init(&pvec); @@ -2408,12 +2793,11 @@ static int f2fs_write_cache_pages(struct address_space *mapping, else tag = PAGECACHE_TAG_DIRTY; retry: + retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; - while (!done && (index <= end)) { - int i; - + while (!done && !retry && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) @@ -2421,15 +2805,62 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bool submitted = false; + bool need_readd; +readd: + need_readd = false; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + ret = f2fs_init_compress_ctx(&cc); + if (ret) { + done = 1; + break; + } + + if (!f2fs_cluster_can_merge_page(&cc, + page->index)) { + ret = f2fs_write_multi_pages(&cc, + &submitted, wbc, io_type); + if (!ret) + need_readd = true; + goto result; + } + if (unlikely(f2fs_cp_error(sbi))) + goto lock_page; + + if (f2fs_cluster_is_empty(&cc)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + + ret2 = f2fs_prepare_compress_overwrite( + inode, &pagep, + page->index, &fsdata); + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + !f2fs_compress_write_end(inode, + fsdata, page->index, + 1)) { + retry = 1; + break; + } + } else { + goto lock_page; + } + } +#endif /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } - +#ifdef CONFIG_F2FS_FS_COMPRESSION +lock_page: +#endif done_index = page->index; retry_write: lock_page(page); @@ -2456,45 +2887,71 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, &bio, - &last_block, wbc, io_type); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + get_page(page); + 
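For orientation, the writeback flow these hunks add, reduced to its control decisions (editor sketch, not literal patch code): dirty pages of a compressed inode are batched into the compress_ctx, each full or non-mergeable cluster is handed to f2fs_write_multi_pages(), and that helper falls back to raw page writes when compression returns -EAGAIN.

    /* per dirty page inside f2fs_write_cache_pages() */
    if (!f2fs_cluster_can_merge_page(&cc, page->index))
        f2fs_write_multi_pages(&cc, &submitted, wbc, io_type);
    f2fs_compress_ctx_add_page(&cc, page);    /* keep batching this cluster */

    /* after the page loop: flush whatever is still batched */
    if (!f2fs_cluster_is_empty(&cc))
        f2fs_write_multi_pages(&cc, &submitted, wbc, io_type);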
f2fs_compress_ctx_add_page(&cc, page); + continue; + } +#endif + ret = f2fs_write_single_data_page(page, &submitted, + &bio, &last_block, wbc, io_type, 0); + if (ret == AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); +#ifdef CONFIG_F2FS_FS_COMPRESSION +result: +#endif + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to * get # of written pages. */ if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); ret = 0; - continue; + goto next; } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { cond_resched(); congestion_wait(BLK_RW_ASYNC, - HZ/50); + HZ/50); goto retry_write; } - continue; + goto next; } done_index = page->index + 1; done = 1; break; - } else if (submitted) { - nwritten++; } - if (--wbc->nr_to_write <= 0 && + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } +next: + if (need_readd) + goto readd; } pagevec_release(&pvec); cond_resched(); } - - if (!cycled && !done) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* flush remained pages in compress cluster */ + if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (ret) { + done = 1; + retry = 0; + } + } +#endif + if ((!cycled && !done) || retry) { cycled = 1; index = 0; end = writeback_index - 1; @@ -2518,6 +2975,8 @@ static inline bool __should_serialize_io(struct inode *inode, { if (!S_ISREG(inode->i_mode)) return false; + if (f2fs_compressed_file(inode)) + return true; if (IS_NOQUOTA(inode)) return false; /* to avoid deadlock in path of data flush */ @@ -2613,14 +3072,16 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); + if (IS_NOQUOTA(inode)) + return; + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - if (!IS_NOQUOTA(inode)) - f2fs_truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2660,6 +3121,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, __do_map_lock(sbi, flag, true); locked = true; } + restart: /* check inline_data */ ipage = f2fs_get_node_page(sbi, inode->i_ino); @@ -2750,6 +3212,24 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (err) goto fail; } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret; + + *fsdata = NULL; + + ret = f2fs_prepare_compress_overwrite(inode, pagep, + index, fsdata); + if (ret < 0) { + err = ret; + goto fail; + } else if (ret) { + return 0; + } + } +#endif + repeat: /* * Do not use grab_cache_page_write_begin() to avoid deadlock due to @@ -2762,6 +3242,8 @@ repeat: goto fail; } + /* TODO: cluster can be compressed due to race with .writepage */ + *pagep = page; err = prepare_write_begin(sbi, page, pos, len, @@ -2845,6 +3327,16 @@ static int f2fs_write_end(struct file *file, else SetPageUptodate(page); } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* overwrite compressed file */ + if (f2fs_compressed_file(inode) && fsdata) { + f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return copied; + } 
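For illustration, a minimal stand-alone sketch (plain C, not kernel code) of the cluster batching used by f2fs_write_cache_pages() above: dirty pages are accumulated until the next page index falls outside the current cluster, at which point the whole cluster is flushed at once, with a final flush after the loop. cluster_of(), flush_cluster() and the fixed cluster size are assumptions made up for the example.

#include <stdio.h>

#define LOG_CLUSTER_SIZE 2                      /* 4 pages per cluster (example) */

static unsigned long cluster_of(unsigned long index)
{
        return index >> LOG_CLUSTER_SIZE;       /* cluster a page index belongs to */
}

static void flush_cluster(unsigned long cluster, int nr_pages)
{
        /* stands in for writing out the accumulated cluster in one go */
        printf("flush cluster %lu with %d page(s)\n", cluster, nr_pages);
}

int main(void)
{
        /* dirty page indexes as writeback would find them */
        unsigned long dirty[] = { 0, 1, 2, 3, 8, 9, 12 };
        unsigned long cur = 0;
        int batched = 0;

        for (unsigned int i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++) {
                unsigned long c = cluster_of(dirty[i]);

                if (batched && c != cur) {
                        flush_cluster(cur, batched);    /* page cannot merge */
                        batched = 0;
                }
                cur = c;
                batched++;                              /* add page to the cluster ctx */
        }
        if (batched)
                flush_cluster(cur, batched);            /* flush the remainder */
        return 0;
}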
+#endif + if (!copied) goto unlock_out; @@ -3145,7 +3637,8 @@ int f2fs_migrate_page(struct address_space *mapping, #ifdef CONFIG_SWAP /* Copied from generic_swapfile_activate() to check any holes */ -static int check_swap_activate(struct file *swap_file, unsigned int max) +static int check_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -3156,6 +3649,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) sector_t last_block; sector_t lowest_block = -1; sector_t highest_block = 0; + int nr_extents = 0; + int ret; blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; @@ -3167,7 +3662,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) probe_block = 0; page_no = 0; last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && page_no < max) { + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { unsigned block_in_page; sector_t first_block; @@ -3207,13 +3703,27 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) highest_block = first_block; } + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } - return 0; - + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; bad_bmap: pr_err("swapon: swapfile has holes\n"); return -EINVAL; @@ -3235,14 +3745,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) return ret; - ret = check_swap_activate(file, sis->max); - if (ret) + if (f2fs_disable_compressed_file(inode)) + return -EINVAL; + + ret = check_swap_activate(sis, file, span); + if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; + return ret; } static void f2fs_swap_deactivate(struct file *file) @@ -3319,6 +3832,27 @@ void f2fs_destroy_post_read_processing(void) kmem_cache_destroy(bio_post_read_ctx_cache); } +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (!f2fs_sb_has_encrypt(sbi) && + !f2fs_sb_has_verity(sbi) && + !f2fs_sb_has_compression(sbi)) + return 0; + + sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!sbi->post_read_wq) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (sbi->post_read_wq) + destroy_workqueue(sbi->post_read_wq); +} + int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", @@ -3328,7 +3862,7 @@ int __init f2fs_init_bio_entry_cache(void) return 0; } -void __exit f2fs_destroy_bio_entry_cache(void) +void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 9b0bedd82581..6b89eae5e4ca 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,9 +21,45 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *f2fs_debugfs_root; +#endif + +/* 
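A self-contained sketch of the swap-extent accounting now done in check_swap_activate(): each page-sized, page-aligned run contributes an extent, and the reported span stretches from the lowest to the highest block seen. For simplicity every run counts as one extent here; the real code lets add_swap_extent() merge adjacent runs. The block numbers are invented.

#include <stdio.h>

int main(void)
{
        /* first_block found for each swap page, in probe order (example data) */
        unsigned long long first_block[] = { 100, 101, 250, 251, 252 };
        unsigned long long lowest = (unsigned long long)-1, highest = 0;
        int nr_extents = 0;

        for (unsigned int i = 0; i < sizeof(first_block) / sizeof(first_block[0]); i++) {
                unsigned long long b = first_block[i];

                if (b < lowest)
                        lowest = b;
                if (b > highest)
                        highest = b;
                nr_extents++;           /* models add_swap_extent(sis, i, 1, b) */
        }

        printf("extents=%d span=%llu\n", nr_extents, 1 + highest - lowest);
        return 0;
}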
+ * This function calculates BDF of every segments + */ +void f2fs_update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = BLKS_PER_SEC(sbi); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, true); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div64_u64(bimodal, dist); + if (si->dirty_count) + si->avg_vblocks = div_u64(total_vblocks, ndirty); + else + si->avg_vblocks = 0; +} +#ifdef CONFIG_DEBUG_FS static void update_general_status(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); @@ -56,7 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->aw_cnt = sbi->atomic_files; si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); @@ -94,6 +130,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->compr_inode = atomic_read(&sbi->compr_inode); + si->compr_blocks = atomic_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -114,7 +152,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; - si->bg_gc = sbi->bg_gc; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; @@ -146,39 +183,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) } /* - * This function calculates BDF of every segments - */ -static void update_sit_info(struct f2fs_sb_info *sbi) -{ - struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; - unsigned long long bimodal, dist; - unsigned int segno, vblocks; - int ndirty = 0; - - bimodal = 0; - total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); - hblks_per_sec = blks_per_sec / 2; - for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, true); - dist = abs(vblocks - hblks_per_sec); - bimodal += dist * dist; - - if (vblocks > 0 && vblocks < blks_per_sec) { - total_vblocks += vblocks; - ndirty++; - } - } - dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); - si->bimodal = div64_u64(bimodal, dist); - if (si->dirty_count) - si->avg_vblocks = div_u64(total_vblocks, ndirty); - else - si->avg_vblocks = 0; -} - -/* * This function calculates memory footprint. 
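A stand-alone model of the BDF computation in f2fs_update_sit_info() above: each section's distance from the half-full point is squared and summed, then the sum is normalised so a fully bimodal layout (every section either empty or full) reads as 100. The section geometry and the vblocks[] samples are invented for the example.

#include <stdio.h>

int main(void)
{
        const unsigned long long blks_per_sec = 512;
        const unsigned long long half = blks_per_sec / 2;
        unsigned long long vblocks[] = { 0, 512, 256, 100, 412 };
        const unsigned long long nsecs = sizeof(vblocks) / sizeof(vblocks[0]);
        unsigned long long bimodal = 0, total = 0, ndirty = 0;

        for (unsigned long long i = 0; i < nsecs; i++) {
                long long dist = (long long)vblocks[i] - (long long)half;

                bimodal += (unsigned long long)(dist * dist);
                if (vblocks[i] > 0 && vblocks[i] < blks_per_sec) {
                        total += vblocks[i];    /* only partially valid sections count as dirty */
                        ndirty++;
                }
        }

        /* same normalisation as above: bimodal / (nsecs * half^2 / 100) */
        printf("BDF=%llu avg_vblocks=%llu\n",
               bimodal / (nsecs * half * half / 100),
               ndirty ? total / ndirty : 0);
        return 0;
}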
*/ static void update_mem_info(struct f2fs_sb_info *sbi) @@ -315,6 +319,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + si->compr_inode, si->compr_blocks); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -441,7 +447,7 @@ static int stat_show(struct seq_file *s, void *v) si->block_count[LFS], si->segment_count[LFS]); /* segment usage info */ - update_sit_info(si->sbi); + f2fs_update_sit_info(si->sbi); seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", si->bimodal, si->avg_vblocks); @@ -461,6 +467,7 @@ static int stat_show(struct seq_file *s, void *v) } DEFINE_SHOW_ATTRIBUTE(stat); +#endif int f2fs_build_stats(struct f2fs_sb_info *sbi) { @@ -491,11 +498,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->compr_inode, 0); + atomic_set(&sbi->compr_blocks, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->aw_cnt, 0); atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); atomic_set(&sbi->max_vw_cnt, 0); @@ -520,14 +528,18 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { +#ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); +#endif } void f2fs_destroy_root_stats(void) { +#ifdef CONFIG_DEBUG_FS debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; +#endif } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d9ad842945df..27d0dd7a16d6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -578,6 +578,20 @@ next: goto next; } +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct fscrypt_name *fname) +{ + struct f2fs_dentry_ptr d; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(fname_len(fname)); + + make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + + return bit_pos < d.max; +} + void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos) @@ -1069,24 +1083,27 @@ static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (!inode || !IS_CASEFOLDED(inode)) { if (len != name->len) return -1; - return memcmp(str, name, len); + return memcmp(str, name->name, len); } - return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); + return f2fs_ci_compare(inode, name, &qstr, false); } static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode)) + if (!inode || !IS_CASEFOLDED(inode)) return 0; norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); diff --git a/fs/f2fs/f2fs.h 
b/fs/f2fs/f2fs.h index 059ade83bfb1..5355be6b6755 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -116,6 +116,8 @@ typedef u32 block_t; /* */ typedef u32 nid_t; +#define COMPRESS_EXT_NUM 16 + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -140,6 +142,12 @@ struct f2fs_mount_info { block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ + + /* For compression */ + unsigned char compress_algorithm; /* algorithm type */ + unsigned compress_log_size; /* cluster log size */ + unsigned char compress_ext_cnt; /* extension count */ + unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -155,6 +163,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_VERITY 0x0400 #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 +#define F2FS_FEATURE_COMPRESSION 0x2000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -712,6 +721,12 @@ struct f2fs_inode_info { int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation time */ struct timespec64 i_disk_time[4];/* inode disk times */ + + /* for file compress */ + u64 i_compr_blocks; /* # of compressed blocks */ + unsigned char i_compress_algorithm; /* algorithm type */ + unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned int i_cluster_size; /* cluster size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1018,6 +1033,7 @@ enum need_lock_type { enum cp_reason_type { CP_NO_NEEDED, CP_NON_REGULAR, + CP_COMPRESSED, CP_HARDLINK, CP_SB_NEED_CP, CP_WRONG_PINO, @@ -1056,12 +1072,15 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ bool is_por; /* indicate IO is from recovery or not */ bool retry; /* need to reallocate block address */ + int compr_blocks; /* # of compressed block addresses */ + bool encrypted; /* indicate file is encrypted */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ @@ -1169,6 +1188,18 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. 
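A small sketch of how the new compression mount options map to cluster geometry, using only the relationships visible in the header (cluster_size = 1 << compress_log_size, one cluster spans cluster_size pages); a 4 KiB PAGE_SIZE and the 2..8 log-size bounds are taken as given for the example.

#include <stdio.h>

#define PAGE_SIZE              4096u
#define MIN_COMPRESS_LOG_SIZE  2
#define MAX_COMPRESS_LOG_SIZE  8

int main(void)
{
        for (unsigned int log = MIN_COMPRESS_LOG_SIZE;
             log <= MAX_COMPRESS_LOG_SIZE; log++) {
                unsigned int cluster_pages = 1u << log;
                unsigned int raw_bytes = cluster_pages * PAGE_SIZE;

                printf("compress_log_size=%u -> %u pages (%u KiB) per cluster\n",
                       log, cluster_pages, raw_bytes / 1024);
        }
        return 0;
}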
+ */ +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) @@ -1176,6 +1207,75 @@ enum fsync_mode { #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif +/* For compression */ +enum compress_algorithm_type { + COMPRESS_LZO, + COMPRESS_LZ4, + COMPRESS_MAX, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 +struct compress_data { + __le32 clen; /* compressed data size */ + __le32 chksum; /* checksum of compressed data */ + __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ + u8 cdata[]; /* compressed data */ +}; + +#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) + +#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 + +/* compress context */ +struct compress_ctx { + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + void *private; /* payload buffer for specified compression algorithm */ +}; + +/* compress context for write IO path */ +struct compress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + refcount_t ref; /* referrence count of raw page */ +}; + +/* decompress io context for read IO path */ +struct decompress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + struct page **tpages; /* temp pages to pad holes in cluster */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + refcount_t ref; /* referrence count of compressed page */ + bool failed; /* indicate IO error during decompression */ +}; + +#define NULL_CLUSTER ((unsigned int)(~0)) +#define MIN_COMPRESS_LOG_SIZE 2 +#define MAX_COMPRESS_LOG_SIZE 8 + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1291,7 +1391,10 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning 
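For reference, a user-space mirror of the on-disk layout declared by struct compress_data above, showing how much payload room remains in a cluster once the header is subtracted; the 4-page cluster and PAGE_SIZE are example values, not taken from the patch.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096u
#define COMPRESS_DATA_RESERVED_SIZE 4

struct compress_data {
        uint32_t clen;                                  /* compressed data size */
        uint32_t chksum;                                /* checksum of payload  */
        uint32_t reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved             */
        uint8_t  cdata[];                               /* compressed data      */
};

#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data))

int main(void)
{
        unsigned int cluster_pages = 4;         /* compress_log_size = 2 */
        unsigned int raw = cluster_pages * PAGE_SIZE;

        printf("header: %zu bytes, raw cluster: %u bytes, payload room: %zu bytes\n",
               COMPRESS_HEADER_SIZE, raw, raw - COMPRESS_HEADER_SIZE);
        return 0;
}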
operations */ - struct mutex gc_mutex; /* mutex for GC */ + struct rw_semaphore gc_lock; /* + * semaphore for GC, avoid + * race between GC and GC or CP + */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ @@ -1327,11 +1430,11 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ - atomic_t aw_cnt; /* # of atomic writes */ + atomic_t compr_inode; /* # of compressed inodes */ + atomic_t compr_blocks; /* # of compressed blocks */ atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ - int bg_gc; /* background gc calls */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1365,6 +1468,8 @@ struct f2fs_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; + + struct workqueue_struct *post_read_wq; /* post read workqueue */ }; struct f2fs_private_dio { @@ -2222,26 +2327,6 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, - int npages, bool no_fail) -{ - struct bio *bio; - - if (no_fail) { - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; - } - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return bio_alloc(GFP_KERNEL, npages); -} - static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { if (sbi->gc_mode == GC_URGENT) @@ -2378,11 +2463,13 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* * On-disk inode flags (f2fs_inode::i_flags) */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ #define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ #define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ #define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ #define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +#define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ #define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ @@ -2391,7 +2478,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL) + F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -2443,6 +2530,8 @@ enum { FI_PIN_FILE, /* indicate file should not be gced */ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_MMAP_FILE, /* indicate file was mmapped */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2459,6 +2548,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: + case FI_COMPRESSED_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2614,16 +2704,27 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_XATTR); } +static inline int f2fs_compressed_file(struct inode *inode) +{ + return S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_COMPRESSED_FILE); +} + static inline unsigned int addrs_per_inode(struct inode *inode) { unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); - return ALIGN_DOWN(addrs, 1); + + if (!f2fs_compressed_file(inode)) + return addrs; + return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); } static inline unsigned int addrs_per_block(struct inode *inode) { - return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, 1); + if (!f2fs_compressed_file(inode)) + return DEF_ADDRS_PER_BLOCK; + return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, F2FS_I(inode)->i_cluster_size); } static inline void *inline_xattr_addr(struct inode *inode, struct page *page) @@ -2656,6 +2757,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DOTS); } +static inline int f2fs_is_mmap_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_MMAP_FILE); +} + static inline bool f2fs_is_pinned_file(struct inode *inode) { return is_inode_flag_set(inode, FI_PIN_FILE); @@ -2783,7 +2889,8 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT) || + is_inode_flag_set(inode, FI_COMPRESSED_FILE)) return false; /* @@ -2903,7 +3010,8 @@ static inline void verify_blkaddr(struct f2fs_sb_info *sbi, static inline bool __is_valid_data_blkaddr(block_t blkaddr) { - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || + blkaddr == COMPRESS_ADDR) return false; return true; } @@ -3001,6 +3109,8 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct fscrypt_name *fname); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); @@ -3155,6 +3265,8 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init 
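A quick illustration of the ALIGN_DOWN change in addrs_per_inode()/addrs_per_block() above: for a compressed inode the usable address count per node block is rounded down to a multiple of the cluster size, so no cluster straddles two node blocks. The numbers below are examples, not the real f2fs constants.

#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) / (a) * (a))

int main(void)
{
        unsigned int def_addrs_per_block = 1018;        /* example value */
        unsigned int cluster_size = 4;                  /* 1 << compress_log_size */

        printf("plain file:      %u addresses per block\n", def_addrs_per_block);
        printf("compressed file: %u addresses per block\n",
               ALIGN_DOWN(def_addrs_per_block, cluster_size));
        return 0;
}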
f2fs_create_segment_manager_caches(void); @@ -3205,10 +3317,13 @@ void f2fs_destroy_checkpoint_caches(void); /* * data.c */ -int f2fs_init_post_read_processing(void); -void f2fs_destroy_post_read_processing(void); +int __init f2fs_init_bioset(void); +void f2fs_destroy_bioset(void); +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3245,8 +3360,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int f2fs_encrypt_one_page(struct f2fs_io_info *fio); bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +int f2fs_write_single_data_page(struct page *page, int *submitted, + struct bio **bio, sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3256,6 +3377,10 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); /* * gc.c @@ -3302,6 +3427,7 @@ struct f2fs_stat_info { int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int compr_inode, compr_blocks; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -3333,7 +3459,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) -#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_bggc_count(si) ((si)->bg_gc++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) @@ -3372,6 +3498,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) +#define stat_inc_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_dec_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_add_compr_blocks(inode, blocks) \ + (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_sub_compr_blocks(inode, blocks) \ + (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_meta_count(sbi, 
blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3389,13 +3529,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) -#define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) -#define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int cur = F2FS_I_SB(inode)->atomic_files; \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -3447,6 +3583,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); +void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else #define stat_inc_cp_count(si) do { } while (0) #define stat_inc_bg_cp_count(si) do { } while (0) @@ -3456,8 +3593,8 @@ void f2fs_destroy_root_stats(void); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sb) do { } while (0) -#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_total_hit(sbi) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) #define stat_inc_cached_node_hit(sbi) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) @@ -3466,6 +3603,10 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) do { } while (0) #define stat_inc_inline_dir(inode) do { } while (0) #define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_compr_inode(inode) do { } while (0) +#define stat_dec_compr_inode(inode) do { } while (0) +#define stat_add_compr_blocks(inode, blocks) do { } while (0) +#define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_atomic_write(inode) do { } while (0) #define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) @@ -3485,6 +3626,7 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } +static inline void update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; @@ -3513,6 +3655,7 @@ void f2fs_truncate_inline_inode(struct inode *inode, int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct page *page); bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, @@ -3605,7 +3748,85 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) */ static inline bool f2fs_post_read_required(struct inode *inode) { - return f2fs_encrypted_file(inode) || fsverity_active(inode); + return f2fs_encrypted_file(inode) || fsverity_active(inode) || + 
f2fs_compressed_file(inode); +} + +/* + * compress.c + */ +#ifdef CONFIG_F2FS_FS_COMPRESSION +bool f2fs_is_compressed_page(struct page *page); +struct page *f2fs_compress_control_page(struct page *page); +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata); +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied); +void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +bool f2fs_is_compress_backend_ready(struct inode *inode); +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +bool f2fs_cluster_is_empty(struct compress_ctx *cc); +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type); +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead); +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); +void f2fs_free_dic(struct decompress_io_ctx *dic); +void f2fs_decompress_end_io(struct page **rpages, + unsigned int cluster_size, bool err, bool verity); +int f2fs_init_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +#else +static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + /* not support compression */ + return false; +} +static inline struct page *f2fs_compress_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} +#endif + +static inline void set_compress_context(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + F2FS_I(inode)->i_compress_algorithm = + F2FS_OPTION(sbi).compress_algorithm; + F2FS_I(inode)->i_log_cluster_size = + F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_cluster_size = + 1 << F2FS_I(inode)->i_log_cluster_size; + F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; + set_inode_flag(inode, FI_COMPRESSED_FILE); + stat_inc_compr_inode(inode); +} + +static inline u64 f2fs_disable_compressed_file(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (!f2fs_compressed_file(inode)) + return 0; + if (fi->i_compr_blocks) + return fi->i_compr_blocks; + + fi->i_flags &= ~F2FS_COMPR_FL; + clear_inode_flag(inode, FI_COMPRESSED_FILE); + stat_dec_compr_inode(inode); + return 0; } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3626,6 +3847,7 @@ F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); +F2FS_FEATURE_FUNCS(compression, COMPRESSION); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, @@ -3707,6 +3929,30 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_may_compress(struct inode *inode) +{ + if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) + return false; + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static inline void 
f2fs_i_compr_blocks_update(struct inode *inode, + u64 blocks, bool add) +{ + int diff = F2FS_I(inode)->i_cluster_size - blocks; + + if (add) { + F2FS_I(inode)->i_compr_blocks += diff; + stat_add_compr_blocks(inode, diff); + } else { + F2FS_I(inode)->i_compr_blocks -= diff; + stat_sub_compr_blocks(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); +} + static inline int block_unaligned_IO(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -3738,6 +3984,8 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return true; if (f2fs_is_multi_device(sbi)) return true; + if (f2fs_compressed_file(inode)) + return true; /* * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 85af112e868d..86ddbb55d2b1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -50,8 +50,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn = { .node_changed = false }; - int err; + struct dnode_of_data dn; + bool need_alloc = true; + int err = 0; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -63,6 +64,26 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret = f2fs_is_compressed_cluster(inode, page->index); + + if (ret < 0) { + err = ret; + goto err; + } else if (ret) { + if (ret < F2FS_I(inode)->i_cluster_size) { + err = -EAGAIN; + goto err; + } + need_alloc = false; + } + } +#endif + /* should do out of any locked page */ + if (need_alloc) + f2fs_balance_fs(sbi, true); + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -78,15 +99,17 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } - /* block allocation */ - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_block(&dn, page->index); - f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - if (err) { - unlock_page(page); - goto out_sem; + if (need_alloc) { + /* block allocation */ + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_block(&dn, page->index); + f2fs_put_dnode(&dn); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + if (err) { + unlock_page(page); + goto out_sem; + } } /* fill the page */ @@ -120,8 +143,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); - f2fs_balance_fs(sbi, dn.node_changed); - sb_end_pagefault(inode->i_sb); err: return block_page_mkwrite_return(err); @@ -155,6 +176,8 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode)) cp_reason = CP_NON_REGULAR; + else if (f2fs_compressed_file(inode)) + cp_reason = CP_COMPRESSED; else if (inode->i_nlink != 1) cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) @@ -485,6 +508,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -492,6 +518,7 @@ static int f2fs_file_mmap(struct file *file, struct 
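A toy model of the arithmetic in f2fs_i_compr_blocks_update() above: the per-inode compressed block counter moves by (cluster_size - blocks) in the requested direction. The cluster size and block counts fed in below are examples only.

#include <stdio.h>

static long long i_compr_blocks;        /* per-inode counter (simplified) */

static void compr_blocks_update(unsigned int cluster_size, unsigned int blocks,
                                int add)
{
        long long diff = (long long)cluster_size - blocks;

        if (add)
                i_compr_blocks += diff;
        else
                i_compr_blocks -= diff;
}

int main(void)
{
        unsigned int cluster_size = 4;          /* 1 << i_log_cluster_size */

        compr_blocks_update(cluster_size, 1, 1);  /* cluster stored in 1 block  */
        compr_blocks_update(cluster_size, 2, 1);  /* cluster stored in 2 blocks */
        printf("i_compr_blocks = %lld\n", i_compr_blocks);     /* 3 + 2 = 5 */

        compr_blocks_update(cluster_size, 1, 0);  /* first cluster released */
        printf("i_compr_blocks = %lld\n", i_compr_blocks);     /* back to 2 */
        return 0;
}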
vm_area_struct *vma) file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; + set_inode_flag(inode, FI_MMAP_FILE); return 0; } @@ -502,6 +529,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + err = fsverity_file_open(inode, filp); if (err) return err; @@ -518,6 +548,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; int base = 0; + bool compressed_cluster = false; + int cluster_index = 0, valid_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -525,26 +558,43 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - for (; count > 0; count--, addr++, dn->ofs_in_node++) { + /* Assumption: truncateion starts with cluster */ + for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); + if (f2fs_compressed_file(dn->inode) && + !(cluster_index & (cluster_size - 1))) { + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, + valid_blocks, false); + compressed_cluster = (blkaddr == COMPRESS_ADDR); + valid_blocks = 0; + } + if (blkaddr == NULL_ADDR) continue; dn->data_blkaddr = NULL_ADDR; f2fs_set_data_blkaddr(dn); - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, + if (__is_valid_data_blkaddr(blkaddr)) { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - continue; + continue; + if (compressed_cluster) + valid_blocks++; + } - f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_invalidate_blocks(sbi, blkaddr); nr_free++; } + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false); + if (nr_free) { pgoff_t fofs; /* @@ -587,6 +637,9 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } + if (f2fs_compressed_file(inode)) + return 0; + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); @@ -602,7 +655,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +static int do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -667,6 +720,28 @@ free_partial: return err; } +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + u64 free_from = from; + + /* + * for compressed file, only support cluster size + * aligned truncation. 
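A self-contained sketch of the cluster-boundary test used while truncating data blocks above: with a power-of-two cluster size, (index & (cluster_size - 1)) == 0 holds exactly at the first slot of each cluster, which is where a COMPRESS_ADDR marker would sit. The index range and cluster size are example values.

#include <stdio.h>

int main(void)
{
        unsigned int cluster_size = 4;

        for (unsigned int index = 0; index < 12; index++)
                printf("slot %2u: %s\n", index,
                       (index & (cluster_size - 1)) == 0 ?
                       "cluster start (check for COMPRESS_ADDR)" : "in cluster");
        return 0;
}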
+ */ + if (f2fs_compressed_file(inode)) { + size_t cluster_shift = PAGE_SHIFT + + F2FS_I(inode)->i_log_cluster_size; + size_t cluster_mask = (1 << cluster_shift) - 1; + + free_from = from >> cluster_shift; + if (from & cluster_mask) + free_from++; + free_from <<= cluster_shift; + } + + return do_truncate_blocks(inode, free_from, lock); +} + int f2fs_truncate(struct inode *inode) { int err; @@ -786,6 +861,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if ((attr->ia_valid & ATTR_SIZE) && + !f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1026,8 +1105,8 @@ next_dnode: } else if (ret == -ENOENT) { if (dn.max_level == 0) return -ENOENT; - done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - dn.ofs_in_node, - len); + done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - + dn.ofs_in_node, len); blkaddr += done; do_replace += done; goto next; @@ -1190,13 +1269,13 @@ static int __exchange_data_block(struct inode *src_inode, src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), array_size(olen, sizeof(block_t)), - GFP_KERNEL); + GFP_NOFS); if (!src_blkaddr) return -ENOMEM; do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), array_size(olen, sizeof(int)), - GFP_KERNEL); + GFP_NOFS); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1563,7 +1642,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; @@ -1621,6 +1700,8 @@ static long f2fs_fallocate(struct file *file, int mode, return -EIO; if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) return -ENOSPC; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) @@ -1630,6 +1711,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; + if (f2fs_compressed_file(inode) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) @@ -1719,7 +1805,40 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -ENOTEMPTY; } + if (iflags & (F2FS_COMPR_FL | F2FS_NOCOMP_FL)) { + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL)) + return -EINVAL; + } + + if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) { + if (S_ISREG(inode->i_mode) && + (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) || + F2FS_HAS_BLOCKS(inode))) + return -EINVAL; + if (iflags & F2FS_NOCOMP_FL) + return -EINVAL; + if (iflags & F2FS_COMPR_FL) { + int err = f2fs_convert_inline_inode(inode); + + if (err) + return err; + + if (!f2fs_may_compress(inode)) + return -EINVAL; + + set_compress_context(inode); + } + } + if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) { + if (fi->i_flags & F2FS_COMPR_FL) + return -EINVAL; + } + fi->i_flags = iflags | (fi->i_flags & ~mask); + f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && + (fi->i_flags & F2FS_NOCOMP_FL)); if (fi->i_flags & 
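A stand-alone version of the round-up performed in f2fs_truncate_blocks() for compressed inodes: the byte offset to free from is advanced to the next cluster boundary so partial clusters are never released. PAGE_SHIFT = 12 and i_log_cluster_size = 2 are assumptions for the example.

#include <stdio.h>

int main(void)
{
        unsigned long long from = 10000;        /* requested truncation point */
        unsigned int cluster_shift = 12 + 2;    /* PAGE_SHIFT + i_log_cluster_size */
        unsigned long long mask = (1ULL << cluster_shift) - 1;
        unsigned long long free_from = from >> cluster_shift;

        if (from & mask)
                free_from++;                    /* round up to the next cluster */
        free_from <<= cluster_shift;

        printf("from=%llu -> free_from=%llu (cluster = %llu bytes)\n",
               from, free_from, mask + 1);
        return 0;
}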
F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); @@ -1745,11 +1864,13 @@ static const struct { u32 iflag; u32 fsflag; } f2fs_fsflags_map[] = { + { F2FS_COMPR_FL, FS_COMPR_FL }, { F2FS_SYNC_FL, FS_SYNC_FL }, { F2FS_IMMUTABLE_FL, FS_IMMUTABLE_FL }, { F2FS_APPEND_FL, FS_APPEND_FL }, { F2FS_NODUMP_FL, FS_NODUMP_FL }, { F2FS_NOATIME_FL, FS_NOATIME_FL }, + { F2FS_NOCOMP_FL, FS_NOCOMP_FL }, { F2FS_INDEX_FL, FS_INDEX_FL }, { F2FS_DIRSYNC_FL, FS_DIRSYNC_FL }, { F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL }, @@ -1757,11 +1878,13 @@ static const struct { }; #define F2FS_GETTABLE_FS_FL ( \ + FS_COMPR_FL | \ FS_SYNC_FL | \ FS_IMMUTABLE_FL | \ FS_APPEND_FL | \ FS_NODUMP_FL | \ FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ FS_INDEX_FL | \ FS_DIRSYNC_FL | \ FS_PROJINHERIT_FL | \ @@ -1772,11 +1895,13 @@ static const struct { FS_CASEFOLD_FL) #define F2FS_SETTABLE_FS_FL ( \ + FS_COMPR_FL | \ FS_SYNC_FL | \ FS_IMMUTABLE_FL | \ FS_APPEND_FL | \ FS_NODUMP_FL | \ FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ FS_DIRSYNC_FL | \ FS_PROJINHERIT_FL | \ FS_CASEFOLD_FL) @@ -1897,6 +2022,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); + f2fs_disable_compressed_file(inode); + if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1935,7 +2062,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; - stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2324,12 +2450,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); @@ -2367,12 +2493,12 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) do_more: if (!range.sync) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); @@ -2803,7 +2929,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } @@ -3098,10 +3224,16 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) ret = -EAGAIN; goto out; } + ret = f2fs_convert_inline_inode(inode); if (ret) goto out; + if (f2fs_disable_compressed_file(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + set_inode_flag(inode, FI_PIN_FILE); ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: @@ -3350,6 +3482,17 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + return generic_file_read_iter(iocb, iter); +} + static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -3361,6 +3504,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } + if 
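A rough user-space model of the compress-flag rules visible in f2fs_setflags_common() above: the compress and no-compress bits are mutually exclusive, the compress bit may only be toggled on an empty regular file, and the no-compress bit cannot be turned on while the compress bit is set. This is a simplified restatement for illustration, not the kernel function.

#include <stdio.h>
#include <errno.h>

#define F2FS_COMPR_FL   0x00000004
#define F2FS_NOCOMP_FL  0x00000400

static int check_compress_flags(unsigned int iflags, unsigned int old_flags,
                                int file_has_data)
{
        if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL))
                return -EINVAL;                 /* mutually exclusive        */
        if (((iflags ^ old_flags) & F2FS_COMPR_FL) && file_has_data)
                return -EINVAL;                 /* toggle only on empty file */
        if (((iflags ^ old_flags) & F2FS_NOCOMP_FL) &&
            (old_flags & F2FS_COMPR_FL))
                return -EINVAL;                 /* NOCOMP can't join COMPR   */
        return 0;
}

int main(void)
{
        printf("%d\n", check_compress_flags(F2FS_COMPR_FL | F2FS_NOCOMP_FL, 0, 0));
        printf("%d\n", check_compress_flags(F2FS_COMPR_FL, 0, 1));
        printf("%d\n", check_compress_flags(F2FS_COMPR_FL, 0, 0));
        return 0;
}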
(!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) { ret = -EAGAIN; @@ -3389,18 +3535,41 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = -EAGAIN; goto out; } - } else { - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); + goto write; + } - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; - } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + goto write; + + if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Convert inline data for Direct I/O before entering + * f2fs_direct_IO(). + */ + err = f2fs_convert_inline_inode(inode); + if (err) + goto out_err; + /* + * If force_buffere_io() is true, we have to allocate + * blocks all the time, since f2fs_direct_IO will fall + * back to buffered IO. + */ + if (!f2fs_force_buffered_io(inode, iocb, from) && + allow_outplace_dio(inode, iocb, from)) + goto write; + } + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { +out_err: + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + ret = err; + goto out; } +write: ret = __generic_file_write_iter(iocb, from); clear_inode_flag(inode, FI_NO_PREALLOC); @@ -3475,7 +3644,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, - .read_iter = generic_file_read_iter, + .read_iter = f2fs_file_read_iter, .write_iter = f2fs_file_write_iter, .open = f2fs_file_open, .release = f2fs_release_file, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b3d399623290..db8725d473b5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,18 +78,18 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT) { wait_ms = gc_th->urgent_sleep_time; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); goto do_gc; } - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -99,7 +99,7 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi); + stat_inc_bggc_count(sbi->stat_info); /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) @@ -1049,8 +1049,10 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; + } if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { @@ -1368,7 +1370,7 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); put_gc_inode(&gc_list); @@ -1407,9 +1409,9 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); do_garbage_collect(sbi, segno, &gc_list, FG_GC); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); put_gc_inode(&gc_list); if (get_valid_blocks(sbi, segno, true)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 896db0416f0e..4167e5408151 
100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -368,7 +368,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_dentry_ptr src, dst; int err; - page = f2fs_grab_cache_page(dir->i_mapping, 0, false); + page = f2fs_grab_cache_page(dir->i_mapping, 0, true); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -530,7 +530,7 @@ recover: return err; } -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int do_convert_inline_dir(struct inode *dir, struct page *ipage, void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) @@ -539,6 +539,44 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); } +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + struct fscrypt_name fname; + void *inline_dentry = NULL; + int err = 0; + + if (!f2fs_has_inline_dentry(dir)) + return 0; + + f2fs_lock_op(sbi); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto out; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_enough_room(dir, ipage, &fname)) { + f2fs_put_page(ipage, 1); + goto out; + } + + inline_dentry = inline_data_addr(dir, ipage); + + err = do_convert_inline_dir(dir, ipage, inline_dentry); + if (!err) + f2fs_put_page(ipage, 1); +out: + f2fs_unlock_op(sbi); + return err; +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) @@ -562,7 +600,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { - err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); + err = do_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 502bd491336a..78c3f1d70f1d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -200,6 +200,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = F2FS_INODE(node_page); unsigned long long iblocks; iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); @@ -286,6 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + if (ri->i_compress_algorithm >= COMPRESS_MAX) + return false; + if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks) + return false; + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) + return false; + } + return true; } @@ -407,6 +421,18 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + (fi->i_flags & F2FS_COMPR_FL)) { + if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + fi->i_compress_algorithm = ri->i_compress_algorithm; + fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_cluster_size = 1 << fi->i_log_cluster_size; 
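A stand-alone restatement of the compression-field checks added to sanity_check_inode() above: the algorithm must be one of the known types, the compressed block count cannot exceed i_blocks, and the cluster log size must stay within the header's bounds. The sample values at the bottom are invented.

#include <stdio.h>
#include <stdint.h>

#define MIN_COMPRESS_LOG_SIZE 2
#define MAX_COMPRESS_LOG_SIZE 8
#define COMPRESS_MAX          2      /* LZO, LZ4 */

static int compress_fields_ok(uint8_t algorithm, uint8_t log_cluster_size,
                              uint64_t compr_blocks, uint64_t i_blocks)
{
        if (algorithm >= COMPRESS_MAX)
                return 0;
        if (compr_blocks > i_blocks)
                return 0;
        if (log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
            log_cluster_size > MAX_COMPRESS_LOG_SIZE)
                return 0;
        return 1;
}

int main(void)
{
        printf("valid:   %d\n", compress_fields_ok(1, 2, 10, 100));
        printf("invalid: %d\n", compress_fields_ok(3, 2, 10, 100));
        return 0;
}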
+ set_inode_flag(inode, FI_COMPRESSED_FILE); + } + } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; @@ -416,6 +442,8 @@ static int do_read_inode(struct inode *inode) stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + stat_inc_compr_inode(inode); + stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); return 0; } @@ -569,6 +597,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_crtime_nsec = cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); } + + if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_log_cluster_size)) { + ri->i_compr_blocks = + cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + ri->i_compress_algorithm = + F2FS_I(inode)->i_compress_algorithm; + ri->i_log_cluster_size = + F2FS_I(inode)->i_log_cluster_size; + } } __set_inode_rdev(inode, ri); @@ -711,6 +750,8 @@ no_delete: stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + stat_dec_compr_inode(inode); + stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a1c507b0b4ac..2aa035422c0f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -119,6 +119,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); + if (f2fs_sb_has_compression(sbi)) { + /* Inherit the compression flag in directory */ + if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && + f2fs_may_compress(inode)) + set_compress_context(inode); + } + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -149,6 +156,9 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) size_t sublen = strlen(sub); int i; + if (sublen == 1 && *sub == '*') + return 1; + /* * filename format of multimedia file should be defined as: * "filename + '.' + extension + (optional: '.' + temp extension)". 
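[Editorial aside, not part of the patch: the extension test above treats a lone "*" as a match-everything wildcard and otherwise looks for ".<ext>" anywhere past the first character, so a temporary suffix such as "trace.log.tmp" still matches "log". A stand-alone user-space sketch of that matching, with hypothetical helper names and a made-up extension list:

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* rough stand-in for is_extension_exist(): "*" matches any name,
 * otherwise the extension must appear as ".<ext>"; a trailing
 * ".<tmp>" part is tolerated, as in the multimedia-name comment */
static int ext_matches(const char *name, const char *ext)
{
        size_t slen = strlen(name), sublen = strlen(ext), i;

        if (sublen == 1 && ext[0] == '*')
                return 1;
        if (slen < sublen + 2)          /* need at least "x.<ext>" */
                return 0;
        for (i = 1; i < slen - sublen; i++) {
                if (name[i] != '.')
                        continue;
                if (!strncasecmp(name + i + 1, ext, sublen))
                        return 1;
        }
        return 0;
}

int main(void)
{
        const char *exts[] = { "txt", "log" };  /* e.g. compress_extension=txt,log */
        const char *names[] = { "notes.txt", "trace.log.tmp", "movie.mp4" };

        for (int n = 0; n < 3; n++) {
                int hit = 0;
                for (int e = 0; e < 2 && !hit; e++)
                        hit = ext_matches(names[n], exts[e]);
                printf("%-14s -> %s\n", names[n],
                       hit ? "would get a compress context" : "left plain");
        }
        return 0;
}

The real set_compress_inode() below additionally bails out for files whose extension is on the superblock's hot list and for inodes carrying F2FS_NOCOMP_FL.]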
@@ -262,6 +272,45 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return 0; } +static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi) || + is_inode_flag_set(inode, FI_COMPRESSED_FILE) || + F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || + !f2fs_may_compress(inode)) + return; + + down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + + for (i = cold_count; i < cold_count + hot_count; i++) { + if (is_extension_exist(name, extlist[i])) { + up_read(&sbi->sb_lock); + return; + } + } + + up_read(&sbi->sb_lock); + + ext = F2FS_OPTION(sbi).extensions; + + for (i = 0; i < ext_cnt; i++) { + if (!is_extension_exist(name, ext[i])) + continue; + + set_compress_context(inode); + return; + } +} + static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -286,6 +335,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); + set_compress_inode(sbi, inode, dentry->d_name.name); + inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -797,6 +848,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + inode->i_state |= I_LINKABLE; *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -849,12 +901,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct inode *whiteout = NULL; - struct page *old_dir_page; + struct page *old_dir_page = NULL; struct page *old_page, *new_page = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err; if (unlikely(f2fs_cp_error(sbi))) @@ -867,6 +918,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; + /* + * If new_inode is null, the below renaming flow will + * add a link in old_dir which can conver inline_dir. + * After then, if we failed to get the entry due to other + * reasons like ENOMEM, we had to remove the new entry. + * Instead of adding such the error handling routine, let's + * simply convert first here. 
+ */ + if (old_dir == new_dir && !new_inode) { + err = f2fs_try_convert_inline_dir(old_dir, new_dentry); + if (err) + return err; + } + + if (flags & RENAME_WHITEOUT) { + err = f2fs_create_whiteout(old_dir, &whiteout); + if (err) + return err; + } + err = dquot_initialize(old_dir); if (err) goto out; @@ -898,17 +969,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(old_dir, &whiteout); - if (err) - goto out_dir; - } - if (new_inode) { err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) - goto out_whiteout; + goto out_dir; err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, @@ -916,7 +981,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) { if (IS_ERR(new_page)) err = PTR_ERR(new_page); - goto out_whiteout; + goto out_dir; } f2fs_balance_fs(sbi, true); @@ -928,6 +993,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto put_out_dir; f2fs_set_link(new_dir, new_entry, new_page, old_inode); + new_page = NULL; new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); @@ -948,33 +1014,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(new_dentry, old_inode); if (err) { f2fs_unlock_op(sbi); - goto out_whiteout; + goto out_dir; } if (old_dir_entry) f2fs_i_links_write(new_dir, true); - - /* - * old entry and new entry can locate in the same inline - * dentry in inode, when attaching new entry in inline dentry, - * it could force inline dentry conversion, after that, - * old_entry and old_page will point to wrong address, in - * order to avoid this, let's do the check and update here. - */ - if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { - f2fs_put_page(old_page, 0); - old_page = NULL; - - old_entry = f2fs_find_entry(old_dir, - &old_dentry->d_name, &old_page); - if (!old_entry) { - err = -ENOENT; - if (IS_ERR(old_page)) - err = PTR_ERR(old_page); - f2fs_unlock_op(sbi); - goto out_whiteout; - } - } } down_write(&F2FS_I(old_inode)->i_sem); @@ -989,9 +1033,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + old_page = NULL; if (whiteout) { - whiteout->i_state |= I_LINKABLE; set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) @@ -1025,17 +1069,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) - f2fs_put_page(new_page, 0); -out_whiteout: - if (whiteout) - iput(whiteout); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); out: + if (whiteout) + iput(whiteout); return err; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 76477f71d4ee..763d5c0951d1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -723,6 +723,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + bool fix_curseg_write_pointer = false; #ifdef CONFIG_QUOTA int quota_enabled; #endif @@ -774,6 +775,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) sbi->sb->s_flags = s_flags; } skip: + fix_curseg_write_pointer = !check_only || list_empty(&inode_list); + destroy_fsync_dnodes(&inode_list, err); 
destroy_fsync_dnodes(&tmp_inode_list, err); @@ -784,9 +787,22 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); - } else { - clear_sbi_flag(sbi, SBI_POR_DOING); } + + /* + * If fsync data succeeds or there is no fsync data to recover, + * and the f2fs is not read only, check and fix zoned block devices' + * write pointer consistency. + */ + if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && + f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_fix_curseg_write_pointer(sbi); + ret = err; + } + + if (!err) + clear_sbi_flag(sbi, SBI_POR_DOING); + mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 56e81447e2f3..cf0eb002cfd4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,6 @@ void f2fs_drop_inmem_pages(struct inode *inode) } fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - stat_dec_atomic_write(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) @@ -505,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -2225,7 +2224,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) struct sit_info *sit_i = SIT_I(sbi); f2fs_bug_on(sbi, addr == NULL_ADDR); - if (addr == NEW_ADDR) + if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); @@ -2861,9 +2860,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); if (err) goto out; @@ -3036,7 +3035,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode) || + f2fs_compressed_file(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -3289,7 +3289,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); - if (fio->bio) + if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); @@ -4368,6 +4368,263 @@ out: return 0; } +#ifdef CONFIG_BLK_DEV_ZONED + +static int check_zone_write_pointer(struct f2fs_sb_info *sbi, + struct f2fs_dev_info *fdev, + struct blk_zone *zone) +{ + unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; + block_t zone_block, wp_block, last_valid_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + int i, s, b, ret; + struct seg_entry *se; + + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_segno = GET_SEGNO(sbi, zone_block); + zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); + + if (zone_segno >= MAIN_SEGS(sbi)) + return 0; + + /* + * Skip check of zones cursegs point to, since + * 
fix_curseg_write_pointer() checks them. + */ + for (i = 0; i < NO_CHECK_TYPE; i++) + if (zone_secno == GET_SEC_FROM_SEG(sbi, + CURSEG_I(sbi, i)->segno)) + return 0; + + /* + * Get last valid block of the zone. + */ + last_valid_block = zone_block - 1; + for (s = sbi->segs_per_sec - 1; s >= 0; s--) { + segno = zone_segno + s; + se = get_seg_entry(sbi, segno); + for (b = sbi->blocks_per_seg - 1; b >= 0; b--) + if (f2fs_test_bit(b, se->cur_valid_map)) { + last_valid_block = START_BLOCK(sbi, segno) + b; + break; + } + if (last_valid_block >= zone_block) + break; + } + + /* + * If last valid block is beyond the write pointer, report the + * inconsistency. This inconsistency does not cause write error + * because the zone will not be selected for write operation until + * it get discarded. Just report it. + */ + if (last_valid_block >= wp_block) { + f2fs_notice(sbi, "Valid block beyond write pointer: " + "valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + return 0; + } + + /* + * If there is no valid block in the zone and if write pointer is + * not at zone start, reset the write pointer. + */ + if (last_valid_block + 1 == zone_block && zone->wp != zone->start) { + f2fs_notice(sbi, + "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: wp[0x%x,0x%x]", + wp_segno, wp_blkoff); + ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, + zone->len >> log_sectors_per_block); + if (ret) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + fdev->path, ret); + return ret; + } + } + + return 0; +} + +static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, + block_t zone_blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && + zone_blkaddr <= FDEV(i).end_blk)) + return &FDEV(i); + } + + return NULL; +} + +static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) { + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *cs = CURSEG_I(sbi, type); + struct f2fs_dev_info *zbd; + struct blk_zone zone; + unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; + block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + sector_t zone_sector; + int err; + + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + /* report zone for the sector the curseg points to */ + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; + + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] 
wp[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); + + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); + allocate_segment_by_default(sbi, type, true); + + /* check consistency of the zone curseg pointed to */ + if (check_zone_write_pointer(sbi, zbd, &zone)) + return -EIO; + + /* check newly assigned zone */ + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + if (zone.wp != zone.start) { + f2fs_notice(sbi, + "New zone for curseg[%d] is not yet discarded. " + "Reset the zone: curseg[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, + zone_sector >> log_sectors_per_block, + zone.len >> log_sectors_per_block); + if (err) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + zbd->path, err); + return err; + } + } + + return 0; +} + +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < NO_CHECK_TYPE; i++) { + ret = fix_curseg_write_pointer(sbi, i); + if (ret) + return ret; + } + + return 0; +} + +struct check_zone_write_pointer_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *fdev; +}; + +static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, + void *data) { + struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; + + return check_zone_write_pointer(args->sbi, args->fdev, zone); +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + struct check_zone_write_pointer_args args; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + + args.sbi = sbi; + args.fdev = &FDEV(i); + ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, + check_zone_write_pointer_cb, &args); + if (ret < 0) + return ret; + } + + return 0; +} +#else +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} +#endif + /* * Update min, max modified time for cost-benefit GC algorithm */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a95467b202ea..459dc3901a57 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -200,18 +200,6 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; -/* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. - */ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) - -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) - #define MAX_SKIP_GC_COUNT 16 struct inmem_pages { @@ -619,8 +607,10 @@ static inline int utilization(struct f2fs_sb_info *sbi) * threashold, * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash * storages. 
IPU will be triggered only if the # of dirty - * pages over min_fsync_blocks. - * F2FS_IPUT_DISABLE - disable IPU. (=default option) + * pages over min_fsync_blocks. (=default option) + * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. + * F2FS_IPU_NOCACHE - disable IPU bio cache. + * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -635,6 +625,7 @@ enum { F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, + F2FS_IPU_NOCACHE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5111e1ffe58a..65a7a432dfee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -141,6 +141,9 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_compress_algorithm, + Opt_compress_log_size, + Opt_compress_extension, Opt_err, }; @@ -203,6 +206,9 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_compress_algorithm, "compress_algorithm=%s"}, + {Opt_compress_log_size, "compress_log_size=%u"}, + {Opt_compress_extension, "compress_extension=%s"}, {Opt_err, NULL}, }; @@ -391,8 +397,9 @@ static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; + unsigned char (*ext)[F2FS_EXTENSION_LEN]; char *p, *name; - int arg = 0; + int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; #ifdef CONFIG_QUOTA @@ -810,6 +817,66 @@ static int parse_options(struct super_block *sb, char *options) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_compress_algorithm: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature if off"); + return -EINVAL; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 3 && !strcmp(name, "lzo")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZO; + } else if (strlen(name) == 3 && + !strcmp(name, "lz4")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZ4; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_compress_log_size: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature is off"); + return -EINVAL; + } + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg < MIN_COMPRESS_LOG_SIZE || + arg > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(sbi, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_log_size = arg; + break; + case Opt_compress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature is off"); + return -EINVAL; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(ext[ext_cnt], name); + F2FS_OPTION(sbi).compress_ext_cnt++; + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1125,6 +1192,8 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); + f2fs_destroy_post_read_wq(sbi); + kvfree(sbi->ckpt); f2fs_unregister_sysfs(sbi); @@ -1169,9 
+1238,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); } f2fs_trace_ios(NULL, 1); @@ -1213,12 +1282,10 @@ static int f2fs_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = 0; - if (dquot->dq_dqb.dqb_bsoftlimit) - limit = dquot->dq_dqb.dqb_bsoftlimit; - if (dquot->dq_dqb.dqb_bhardlimit && - (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) - limit = dquot->dq_dqb.dqb_bhardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + if (limit) + limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; @@ -1228,12 +1295,8 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = 0; - if (dquot->dq_dqb.dqb_isoftlimit) - limit = dquot->dq_dqb.dqb_isoftlimit; - if (dquot->dq_dqb.dqb_ihardlimit && - (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) - limit = dquot->dq_dqb.dqb_ihardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; @@ -1340,6 +1403,35 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +static inline void f2fs_show_compress_options(struct seq_file *seq, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *algtype = ""; + int i; + + if (!f2fs_sb_has_compression(sbi)) + return; + + switch (F2FS_OPTION(sbi).compress_algorithm) { + case COMPRESS_LZO: + algtype = "lzo"; + break; + case COMPRESS_LZ4: + algtype = "lz4"; + break; + } + seq_printf(seq, ",compress_algorithm=%s", algtype); + + seq_printf(seq, ",compress_log_size=%u", + F2FS_OPTION(sbi).compress_log_size); + + for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) { + seq_printf(seq, ",compress_extension=%s", + F2FS_OPTION(sbi).extensions[i]); + } +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -1462,6 +1554,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fsync_mode=%s", "strict"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) seq_printf(seq, ",fsync_mode=%s", "nobarrier"); + + f2fs_show_compress_options(seq, sbi->sb); return 0; } @@ -1476,6 +1570,9 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1524,7 +1621,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) f2fs_update_time(sbi, DISABLE_TIME); while (!f2fs_time_over(sbi, DISABLE_TIME)) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; @@ -1546,7 +1643,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = 
f2fs_write_checkpoint(sbi, &cpc); @@ -1558,7 +1655,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); restore_flag: sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ return err; @@ -1566,12 +1663,12 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); } @@ -2158,7 +2255,7 @@ static int f2fs_dquot_commit(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); @@ -2182,13 +2279,10 @@ static int f2fs_dquot_acquire(struct dquot *dquot) static int f2fs_dquot_release(struct dquot *dquot) { struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); - int ret; + int ret = dquot_release(dquot); - down_read(&sbi->quota_sem); - ret = dquot_release(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); return ret; } @@ -2196,29 +2290,22 @@ static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret; - - down_read(&sbi->quota_sem); - ret = dquot_mark_dquot_dirty(dquot); + int ret = dquot_mark_dquot_dirty(dquot); /* if we are using journalled quota */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); - up_read(&sbi->quota_sem); return ret; } static int f2fs_dquot_commit_info(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret; + int ret = dquot_commit_info(sb, type); - down_read(&sbi->quota_sem); - ret = dquot_commit_info(sb, type); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); return ret; } @@ -3311,7 +3398,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - mutex_init(&sbi->gc_mutex); + init_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); mutex_init(&sbi->resize_mutex); @@ -3400,6 +3487,12 @@ try_onemore: goto free_devices; } + err = f2fs_init_post_read_wq(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize post read workqueue"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -3544,6 +3637,17 @@ try_onemore: goto free_meta; } } + + /* + * If the f2fs is not readonly and fsync data recovery succeeds, + * check zoned block devices' write pointer consistency. 
+ */ + if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_check_write_pointer(sbi); + if (err) + goto free_meta; + } + reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3621,6 +3725,7 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -3762,8 +3867,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_bio_entry_cache(); if (err) goto free_post_read; + err = f2fs_init_bioset(); + if (err) + goto free_bio_enrty_cache; return 0; - +free_bio_enrty_cache: + f2fs_destroy_bio_entry_cache(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -3789,6 +3898,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 70945ceb9c0c..91d649790b1b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -25,6 +25,9 @@ enum { DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_STAT_FS + STAT_INFO, /* struct f2fs_stat_info */ +#endif #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ @@ -42,6 +45,9 @@ struct f2fs_attr { int id; }; +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf); + static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) { if (struct_type == GC_THREAD) @@ -59,41 +65,25 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) struct_type == FAULT_INFO_TYPE) return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif +#ifdef CONFIG_F2FS_STAT_FS + else if (struct_type == STAT_INFO) + return (unsigned char *)F2FS_STAT(sbi); +#endif return NULL; } static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(dirty_segments(sbi))); + return sprintf(buf, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); } -static ssize_t unusable_show(struct f2fs_attr *a, +static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - block_t unusable; - - if (test_opt(sbi, DISABLE_CHECKPOINT)) - unusable = sbi->unusable_block_count; - else - unusable = f2fs_get_unusable_blocks(sbi); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)unusable); -} - -static ssize_t encoding_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ -#ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 0xff, - (sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); -#endif - return snprintf(buf, PAGE_SIZE, "(none)"); + return sprintf(buf, "%llu\n", + (unsigned long long)(free_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, @@ -102,10 +92,10 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + return sprintf(buf, "0\n"); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + + return sprintf(buf, "%llu\n", + 
(unsigned long long)(sbi->kbytes_written + BD_PART_WRITTEN(sbi))); } @@ -116,7 +106,7 @@ static ssize_t features_show(struct f2fs_attr *a, int len = 0; if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + return sprintf(buf, "0\n"); if (f2fs_sb_has_encrypt(sbi)) len += snprintf(buf, PAGE_SIZE - len, "%s", @@ -154,6 +144,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_compression(sbi)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "compression"); len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "pin_file"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); @@ -163,9 +156,66 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); + return sprintf(buf, "%u\n", sbi->current_reserved_blocks); +} + +static ssize_t unusable_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + block_t unusable; + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + unusable = sbi->unusable_block_count; + else + unusable = f2fs_get_unusable_blocks(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)unusable); +} + +static ssize_t encoding_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ +#ifdef CONFIG_UNICODE + if (f2fs_sb_has_casefold(sbi)) + return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + sbi->s_encoding->charset, + (sbi->s_encoding->version >> 16) & 0xff, + (sbi->s_encoding->version >> 8) & 0xff, + sbi->s_encoding->version & 0xff); +#endif + return sprintf(buf, "(none)"); } +#ifdef CONFIG_F2FS_STAT_FS +static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->tot_blks - + (si->bg_data_blks + si->bg_node_blks))); +} + +static ssize_t moved_blocks_background_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); +} + +static ssize_t avg_vblocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + si->dirty_count = dirty_segments(sbi); + f2fs_update_sit_info(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); +} +#endif + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -199,7 +249,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); + return sprintf(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -389,6 +439,7 @@ enum feat_id { FEAT_VERITY, FEAT_SB_CHECKSUM, FEAT_CASEFOLD, + FEAT_COMPRESSION, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -408,7 +459,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_VERITY: case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: - return snprintf(buf, PAGE_SIZE, "supported\n"); + case FEAT_COMPRESSION: + return sprintf(buf, "supported\n"); } return 0; } @@ -437,6 +489,14 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .id = _id, \ } +#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ 
+static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sbi_show, \ + .struct_type = _struct_type, \ + .offset = offsetof(struct _struct_name, _elname), \ +} + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); @@ -478,11 +538,21 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(dirty_segments); +F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +#ifdef CONFIG_F2FS_STAT_FS +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc); +F2FS_GENERAL_RO_ATTR(moved_blocks_background); +F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); +F2FS_GENERAL_RO_ATTR(avg_vblocks); +#endif #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -503,6 +573,7 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -543,12 +614,22 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(dirty_segments), + ATTR_LIST(free_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), +#ifdef CONFIG_F2FS_STAT_FS + ATTR_LIST(cp_foreground_calls), + ATTR_LIST(cp_background_calls), + ATTR_LIST(gc_foreground_calls), + ATTR_LIST(gc_background_calls), + ATTR_LIST(moved_blocks_foreground), + ATTR_LIST(moved_blocks_background), + ATTR_LIST(avg_vblocks), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -573,6 +654,7 @@ static struct attribute *f2fs_feat_attrs[] = { #endif ATTR_LIST(sb_checksum), ATTR_LIST(casefold), + ATTR_LIST(compression), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -733,10 +815,12 @@ int __init f2fs_init_sysfs(void) ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, NULL, "features"); - if (ret) + if (ret) { + kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); - else + } else { f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + } return ret; } @@ -757,8 +841,11 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); return err; + } if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -786,4 +873,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5f04c5c810fb..594b05ae16c9 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -21,6 +21,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> 
#include <asm/unaligned.h> +#include <linux/random.h> #include <linux/iversion.h> #include "fat.h" @@ -521,7 +522,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = get_seconds(); + inode->i_generation = prandom_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; diff --git a/fs/file.c b/fs/file.c index 3da91a112bab..a364e1a9b7e8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -642,7 +642,9 @@ out_unlock: EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ /* - * variant of __close_fd that gets a ref on the file for later fput + * variant of __close_fd that gets a ref on the file for later fput. + * The caller must ensure that filp_close() called on the file, and then + * an fput(). */ int __close_fd_get_file(unsigned int fd, struct file **res) { @@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res) spin_unlock(&files->file_lock); get_file(file); *res = file; - return filp_close(file, files); + return 0; out_unlock: spin_unlock(&files->file_lock); @@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +static struct file *__fget_files(struct files_struct *files, unsigned int fd, + fmode_t mask, unsigned int refs) { - struct files_struct *files = current->files; struct file *file; rcu_read_lock(); @@ -729,6 +731,12 @@ loop: return file; } +static inline struct file *__fget(unsigned int fd, fmode_t mask, + unsigned int refs) +{ + return __fget_files(current->files, fd, mask, refs); +} + struct file *fget_many(unsigned int fd, unsigned int refs) { return __fget(fd, FMODE_PATH, refs); @@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd) } EXPORT_SYMBOL(fget_raw); +struct file *fget_task(struct task_struct *task, unsigned int fd) +{ + struct file *file = NULL; + + task_lock(task); + if (task->files) + file = __fget_files(task->files, fd, 0, 1); + task_unlock(task); + + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
* diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 335607b8c5c0..76ac9c7d32ec 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2063,7 +2063,7 @@ void wb_workfn(struct work_struct *work) struct bdi_writeback, dwork); long pages_written; - set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); + set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9c6df721321a..ba83b49ce18c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -183,14 +183,12 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int ret; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; if (PageChecked(page) || current->journal_info) goto out_ignore; - ret = __gfs2_jdata_writepage(page, wbc); - return ret; + return __gfs2_jdata_writepage(page, wbc); out_ignore: redirty_page_for_writepage(wbc, page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index eb9c0578978f..c8b62577e2f2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -73,9 +73,6 @@ #include "bmap.h" #include "util.h" -#define IS_LEAF 1 /* Hashed (leaf) directory */ -#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ - #define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b7123de7c180..d0eceaff3cea 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -826,7 +826,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); + gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4ede1f18de85..061d22e1ceb6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -95,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) /* A shortened, inline version of gfs2_trans_begin() * tr->alloced is not set since the transaction structure is * on the stack */ - tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes); tr.tr_ip = _RET_IP_; if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) return; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 5f89c515f5bb..9fd88ed18807 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -387,8 +387,6 @@ struct gfs2_glock { struct rhash_head gl_node; }; -#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ - enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, @@ -505,6 +503,7 @@ struct gfs2_trans { unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; struct list_head tr_list; struct list_head tr_databuf; @@ -703,6 +702,7 @@ struct gfs2_sbd { u32 sd_fsb2bb_shift; u32 sd_diptrs; /* Number of pointers in a dinode */ u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_ldptrs; /* Number of pointers in a log descriptor block */ u32 sd_jbsize; /* Size of a journaled data block */ u32 sd_hash_bsize; /* sizeof(exhash block) */ u32 sd_hash_bsize_shift; @@ -803,7 +803,7 @@ struct gfs2_sbd { struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - int 
sd_log_commited_revoke; + int sd_log_committed_revoke; atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index dafef10b91f1..2716d56ed0a0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) @@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_glock_put(io_gl); io_gl = NULL; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); + inode->i_atime.tv_nsec = 0; + if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ error = gfs2_inode_refresh(GFS2_I(inode)); if (error) goto fail_refresh; } else { + ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; - unlock_new_inode(inode); } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb3f2e7b8085..00a2e721a374 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -37,7 +37,6 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp); * gfs2_struct2blk - compute stuff * @sdp: the filesystem * @nstruct: the number of structures - * @ssize: the size of the structures * * Compute the number of log descriptor blocks needed to hold a certain number * of structures of a certain size. @@ -45,18 +44,16 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp); * Returns: the number of blocks needed (minimum is always 1) */ -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize) +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) { unsigned int blks; unsigned int first, second; blks = 1; - first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + first = sdp->sd_ldptrs; if (nstruct > first) { - second = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / ssize; + second = sdp->sd_inptrs; blks += DIV_ROUND_UP(nstruct - first, second); } @@ -472,9 +469,8 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); } - if (sdp->sd_log_commited_revoke > 0) - reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, - sizeof(u64)); + if (sdp->sd_log_committed_revoke > 0) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_committed_revoke); /* One for the overall header */ if (reserved) reserved++; @@ -829,7 +825,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (unlikely(state == SFS_FROZEN)) gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, - sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke); gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); @@ -848,7 +844,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_commited_revoke = 0; + sdp->sd_log_committed_revoke = 0; spin_lock(&sdp->sd_ail_lock); if (tr && !list_empty(&tr->tr_ail1_list)) { @@ -899,6 +895,7 @@ static void 
gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) old->tr_num_buf_rm += new->tr_num_buf_rm; old->tr_num_databuf_rm += new->tr_num_databuf_rm; old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); @@ -920,7 +917,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) set_bit(TR_ATTACHED, &tr->tr_flags); } - sdp->sd_log_commited_revoke += tr->tr_num_revoke; + sdp->sd_log_committed_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; gfs2_assert_withdraw(sdp, maxres >= reserved); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 2ff163a8dce1..c0a65e5a126b 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -60,9 +60,9 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) spin_unlock(&sdp->sd_ordered_lock); } } + extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); -extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize); +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 55fed7daf2b1..d9431724b788 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -259,7 +259,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, struct super_block *sb = sdp->sd_vfs; struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno << (sb->s_blocksize_bits - 9); bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -472,6 +472,20 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, put_page(page); /* Once more for find_or_create_page */ } +static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) +{ + struct bio *new; + + new = bio_alloc(GFP_NOIO, nr_iovecs); + bio_copy_dev(new, prev); + new->bi_iter.bi_sector = bio_end_sector(prev); + new->bi_opf = prev->bi_opf; + new->bi_write_hint = prev->bi_write_hint; + bio_chain(new, prev); + submit_bio(prev); + return new; +} + /** * gfs2_find_jhead - find the head of a log * @jd: The journal descriptor @@ -488,15 +502,15 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct address_space *mapping = jd->jd_inode->i_mapping; unsigned int block = 0, blocks_submitted = 0, blocks_read = 0; - unsigned int bsize = sdp->sd_sb.sb_bsize; + unsigned int bsize = sdp->sd_sb.sb_bsize, off; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int shift = PAGE_SHIFT - bsize_shift; - unsigned int readhead_blocks = BIO_MAX_PAGES << shift; + unsigned int readahead_blocks = BIO_MAX_PAGES << shift; struct gfs2_journal_extent *je; int sz, ret = 0; struct bio *bio = NULL; struct page *page = NULL; - bool done = false; + bool bio_chained = false, done = false; errseq_t since; memset(head, 0, sizeof(*head)); @@ -505,9 +519,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, since = filemap_sample_wb_err(mapping); list_for_each_entry(je, &jd->extent_list, list) { - for (; block < je->lblock + je->blocks; block++) { - u64 dblock; + u64 dblock = je->dblock; + for 
(; block < je->lblock + je->blocks; block++, dblock++) { if (!page) { page = find_or_create_page(mapping, block >> shift, GFP_NOFS); @@ -516,35 +530,41 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, done = true; goto out; } + off = 0; } - if (bio) { - unsigned int off; - - off = (block << bsize_shift) & ~PAGE_MASK; + if (!bio || (bio_chained && !off)) { + /* start new bio */ + } else { sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) { /* block added */ - if (off + bsize == PAGE_SIZE) { - page = NULL; - goto page_added; - } - continue; + if (sz == bsize) + goto block_added; + if (off) { + unsigned int blocks = + (PAGE_SIZE - off) >> bsize_shift; + + bio = gfs2_chain_bio(bio, blocks); + bio_chained = true; + goto add_block_to_new_bio; } + } + + if (bio) { blocks_submitted = block + 1; submit_bio(bio); - bio = NULL; } - dblock = je->dblock + (block - je->lblock); bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; - sz = bio_add_page(bio, page, bsize, 0); - gfs2_assert_warn(sdp, sz == bsize); - if (bsize == PAGE_SIZE) + bio_chained = false; +add_block_to_new_bio: + sz = bio_add_page(bio, page, bsize, off); + BUG_ON(sz != bsize); +block_added: + off += bsize; + if (off == PAGE_SIZE) page = NULL; - -page_added: - if (blocks_submitted < blocks_read + readhead_blocks) { + if (blocks_submitted < blocks_read + readahead_blocks) { /* Keep at least one bio in flight */ continue; } @@ -846,7 +866,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (!sdp->sd_log_num_revoke) return; - length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke); page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); offset = sizeof(struct gfs2_log_descriptor); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e8b7b0ce8404..b3e904bcc02c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -298,6 +298,8 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_ldptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_log_descriptor)) / sizeof(u64); sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2466bb44a23c..e7bf91ec231c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -36,16 +36,6 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) -#if BITS_PER_LONG == 32 -#define LBITMASK (0x55555555UL) -#define LBITSKIP55 (0x55555555UL) -#define LBITSKIP00 (0x00000000UL) -#else -#define LBITMASK (0x5555555555555555UL) -#define LBITSKIP55 (0x5555555555555555UL) -#define LBITSKIP00 (0x0000000000000000UL) -#endif - /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. 
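[Editorial aside, not part of the patch: with the new sd_ldptrs field, gfs2_struct2blk() reduces to a simple capacity calculation: the first log descriptor block carries sd_ldptrs u64 revoke entries and each continuation block carries sd_inptrs. A minimal user-space sketch of that arithmetic follows; the 72-byte and 24-byte header sizes are an assumption for a 4096-byte block, not something stated in the patch:

#include <stdio.h>

/* user-space model of the reworked gfs2_struct2blk(): one descriptor
 * block holds `first` entries, each continuation block holds `second`;
 * result is the number of log blocks needed (minimum 1) */
static unsigned int struct2blk(unsigned int nstruct, unsigned int first,
                               unsigned int second)
{
        unsigned int blks = 1;

        if (nstruct > first)    /* DIV_ROUND_UP of the overflow */
                blks += (nstruct - first + second - 1) / second;
        return blks;
}

int main(void)
{
        unsigned int bsize = 4096;
        unsigned int first = (bsize - 72) / 8;  /* plays the role of sd_ldptrs */
        unsigned int second = (bsize - 24) / 8; /* plays the role of sd_inptrs */

        for (unsigned int n = 1; n <= 2000; n *= 10)
                printf("%4u revokes -> %u log block(s)\n",
                       n, struct2blk(n, first, second));
        return 0;
}

The surrounding log.c changes feed sd_log_committed_revoke, now accumulated net of tr_num_revoke_rm, into this calculation via calc_reserved().]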
Each block is represented by two diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9d4227330de4..a685637a5b55 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -49,8 +49,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) - tr->tr_reserved += gfs2_struct2blk(sdp, revokes, - sizeof(u64)); + tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); @@ -77,10 +76,10 @@ static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); - fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke); + tr->tr_num_revoke, tr->tr_num_revoke_rm); } void gfs2_trans_end(struct gfs2_sbd *sdp) @@ -265,7 +264,7 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) if (bd->bd_gl) gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke--; + tr->tr_num_revoke_rm++; if (--n == 0) break; } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 6d0783e2e276..f71c384064c8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -242,19 +242,35 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); /* * There are two time systems. Both are based on seconds since * a particular time/date. - * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970 + * Unix: signed little-endian since 00:00 GMT, Jan. 1, 1970 * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 * + * HFS implementations are highly inconsistent, this one matches the + * traditional behavior of 64-bit Linux, giving the most useful + * time range between 1970 and 2106, by treating any on-disk timestamp + * under HFS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. 
*/ -#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) -#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) +#define HFS_UTC_OFFSET 2082844800U +static inline time64_t __hfs_m_to_utime(__be32 mt) +{ + time64_t ut = (u32)(be32_to_cpu(mt) - HFS_UTC_OFFSET); + + return ut + sys_tz.tz_minuteswest * 60; +} + +static inline __be32 __hfs_u_to_mtime(time64_t ut) +{ + ut -= sys_tz.tz_minuteswest * 60; + + return cpu_to_be32(lower_32_bits(ut) + HFS_UTC_OFFSET); +} #define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) -#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } -#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) -#define hfs_mtime() __hfs_u_to_mtime(get_seconds()) +#define hfs_m_to_utime(time) (struct timespec64){ .tv_sec = __hfs_m_to_utime(time) } +#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) +#define hfs_mtime() __hfs_u_to_mtime(ktime_get_real_seconds()) static inline const char *hfs_mdb_name(struct super_block *sb) { diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index da243c84e93b..2f224b98ee94 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -351,7 +351,7 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; inode->i_ctime = inode->i_atime = inode->i_mtime = - timespec_to_timespec64(hfs_m_to_utime(rec->file.MdDat)); + hfs_m_to_utime(rec->file.MdDat); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -362,7 +362,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); inode->i_ctime = inode->i_atime = inode->i_mtime = - timespec_to_timespec64(hfs_m_to_utime(rec->dir.MdDat)); + hfs_m_to_utime(rec->dir.MdDat); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b8471bf05def..3b03fff68543 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -533,13 +533,31 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, int op, int op_flags); int hfsplus_read_wrapper(struct super_block *sb); -/* time macros */ -#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) -#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U)) +/* + * time helpers: convert between 1904-base and 1970-base timestamps + * + * HFS+ implementations are highly inconsistent, this one matches the + * traditional behavior of 64-bit Linux, giving the most useful + * time range between 1970 and 2106, by treating any on-disk timestamp + * under HFSPLUS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. 
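The new HFS/HFS+ helpers do the subtraction in 32-bit unsigned arithmetic before widening, so on-disk values below the 1970 offset wrap into 2040..2106 rather than going negative. A small standalone sketch of that wraparound (hypothetical helper name, timezone adjustment omitted):

#include <stdint.h>
#include <stdio.h>

#define UTC_OFFSET	2082844800U	/* seconds from 1904-01-01 to 1970-01-01 */

/* mirrors the conversion in __hfs_m_to_utime()/__hfsp_mt2ut() above */
static int64_t mac_to_unix(uint32_t mac_secs)
{
	/* subtract as u32, then widen: anything below UTC_OFFSET wraps
	 * around and lands in 2040..2106 instead of before 1970 */
	return (int64_t)(uint32_t)(mac_secs - UTC_OFFSET);
}

int main(void)
{
	printf("%lld\n", (long long)mac_to_unix(UTC_OFFSET));		/* 0 -> 1970-01-01 */
	printf("%lld\n", (long long)mac_to_unix(UTC_OFFSET - 1));	/* wraps to ~2106 */
	printf("%lld\n", (long long)mac_to_unix(0));			/* wraps to ~2040 */
	return 0;
}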
+ */ +#define HFSPLUS_UTC_OFFSET 2082844800U + +static inline time64_t __hfsp_mt2ut(__be32 mt) +{ + time64_t ut = (u32)(be32_to_cpu(mt) - HFSPLUS_UTC_OFFSET); + + return ut; +} + +static inline __be32 __hfsp_ut2mt(time64_t ut) +{ + return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET); +} /* compatibility */ -#define hfsp_mt2ut(t) (struct timespec){ .tv_sec = __hfsp_mt2ut(t) } +#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) -#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) +#define hfsp_now2mt() __hfsp_ut2mt(ktime_get_real_seconds()) #endif diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d131c8ea7eb6..94bd83b36644 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -504,9 +504,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); - inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(folder->access_date)); - inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(folder->content_mod_date)); - inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(folder->attribute_mod_date)); + inode->i_atime = hfsp_mt2ut(folder->access_date); + inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); + inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -542,9 +542,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } - inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(file->access_date)); - inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(file->content_mod_date)); - inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(file->attribute_mod_date)); + inode->i_atime = hfsp_mt2ut(file->access_date); + inode->i_mtime = hfsp_mt2ut(file->content_mod_date); + inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index f4295aa19350..69cb796f6270 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -37,16 +37,20 @@ * is on, and remove the appropriate bits from attr->ia_mode (attr is a * "struct iattr *"). 
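hostfs now carries timestamps across the kernel/host boundary in a fixed-layout pair of long longs instead of a native struct timespec, so the layout no longer depends on the host's time_t width. A tiny sketch of the idea; only hostfs_timespec comes from the header, the ts64 stand-in and helper below are illustrative:

#include <stdio.h>

struct hostfs_timespec { long long tv_sec; long long tv_nsec; };	/* fixed layout, as in hostfs.h */
struct ts64 { long long tv_sec; long tv_nsec; };			/* stand-in for struct timespec64 */

static struct ts64 hostfs_to_ts64(struct hostfs_timespec h)
{
	return (struct ts64){ h.tv_sec, (long)h.tv_nsec };
}

int main(void)
{
	struct hostfs_timespec h = { 1600000000LL, 123456789LL };
	struct ts64 t = hostfs_to_ts64(h);

	printf("%lld.%09ld\n", t.tv_sec, t.tv_nsec);
	return 0;
}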
-BlaisorBlade */ +struct hostfs_timespec { + long long tv_sec; + long long tv_nsec; +}; struct hostfs_iattr { - unsigned int ia_valid; - unsigned short ia_mode; - uid_t ia_uid; - gid_t ia_gid; - loff_t ia_size; - struct timespec ia_atime; - struct timespec ia_mtime; - struct timespec ia_ctime; + unsigned int ia_valid; + unsigned short ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct hostfs_timespec ia_atime; + struct hostfs_timespec ia_mtime; + struct hostfs_timespec ia_ctime; }; struct hostfs_stat { @@ -56,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime; unsigned int blksize; unsigned long long blocks; unsigned int maj; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5a7eb0c79839..e6b8c49076bb 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -549,9 +549,9 @@ static int read_name(struct inode *ino, char *name) set_nlink(ino, st.nlink); i_uid_write(ino, st.uid); i_gid_write(ino, st.gid); - ino->i_atime = timespec_to_timespec64(st.atime); - ino->i_mtime = timespec_to_timespec64(st.mtime); - ino->i_ctime = timespec_to_timespec64(st.ctime); + ino->i_atime = (struct timespec64){ st.atime.tv_sec, st.atime.tv_nsec }; + ino->i_mtime = (struct timespec64){ st.mtime.tv_sec, st.mtime.tv_nsec }; + ino->i_ctime = (struct timespec64){ st.ctime.tv_sec, st.ctime.tv_nsec }; ino->i_size = st.size; ino->i_blocks = st.blocks; return 0; @@ -820,15 +820,18 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_ATIME) { attrs.ia_valid |= HOSTFS_ATTR_ATIME; - attrs.ia_atime = timespec64_to_timespec(attr->ia_atime); + attrs.ia_atime = (struct hostfs_timespec) + { attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec }; } if (attr->ia_valid & ATTR_MTIME) { attrs.ia_valid |= HOSTFS_ATTR_MTIME; - attrs.ia_mtime = timespec64_to_timespec(attr->ia_mtime); + attrs.ia_mtime = (struct hostfs_timespec) + { attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec }; } if (attr->ia_valid & ATTR_CTIME) { attrs.ia_valid |= HOSTFS_ATTR_CTIME; - attrs.ia_ctime = timespec64_to_timespec(attr->ia_ctime); + attrs.ia_ctime = (struct hostfs_timespec) + { attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec }; } if (attr->ia_valid & ATTR_ATIME_SET) { attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; diff --git a/fs/internal.h b/fs/internal.h index e3fa69544b66..f3f280b952a3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); +extern struct open_how build_open_how(int flags, umode_t mode); +extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); long do_faccessat(int dfd, const char __user *filename, int mode); @@ -180,11 +182,11 @@ extern void mnt_pin_kill(struct mount *m); */ extern const struct dentry_operations ns_dentry_operations; -/* - * fs/ioctl.c - */ -extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, - unsigned long arg); - /* direct-io.c: */ int sb_init_dio_done_wq(struct super_block *sb); + +/* + * fs/stat.c: + */ +unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags); +int cp_statx(const struct kstat *stat, struct statx __user *buffer); diff --git a/fs/io-wq.c 
b/fs/io-wq.c index 5147d2213b01..cb60a42b9fdf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -56,7 +56,8 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; - const struct cred *creds; + const struct cred *cur_creds; + const struct cred *saved_creds; struct files_struct *restore_files; }; @@ -109,10 +110,10 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - const struct cred *creds; - struct mm_struct *mm; refcount_t refs; struct completion done; + + refcount_t use_refs; }; static bool io_worker_get(struct io_worker *worker) @@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; - if (worker->creds) { - revert_creds(worker->creds); - worker->creds = NULL; + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; } if (current->files != worker->restore_files) { @@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) return NULL; } +static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +{ + if (worker->mm) { + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + } + if (!work->mm) { + set_fs(KERNEL_DS); + return; + } + if (mmget_not_zero(work->mm)) { + use_mm(work->mm); + if (!worker->mm) + set_fs(USER_DS); + worker->mm = work->mm; + /* hang on to this mm */ + work->mm = NULL; + return; + } + + /* failed grabbing mm, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; +} + +static void io_wq_switch_creds(struct io_worker *worker, + struct io_wq_work *work) +{ + const struct cred *old_creds = override_creds(work->creds); + + worker->cur_creds = work->creds; + if (worker->saved_creds) + put_cred(old_creds); /* creds set by previous switch */ + else + worker->saved_creds = old_creds; +} + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { @@ -438,24 +476,19 @@ next: if (work->flags & IO_WQ_WORK_CB) work->func(&work); - if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && - current->files != work->files) { + if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && - wq->mm) { - if (mmget_not_zero(wq->mm)) { - use_mm(wq->mm); - set_fs(USER_DS); - worker->mm = wq->mm; - } else { - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if (!worker->creds) - worker->creds = override_creds(wq->creds); + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); + /* + * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, + * the worker function will do the right thing. 
+ */ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; if (worker->mm) @@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + int work_flags; unsigned long flags; /* @@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } + work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); wq_list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); - if (!atomic_read(&acct->nr_running)) + if ((work_flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running)) io_wqe_wake_worker(wqe, acct); } @@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) */ spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && data->cancel(worker->cur_work, data->caller_data)) { send_sig(SIGINT, worker->task, 1); ret = true; @@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return false; spin_lock_irqsave(&worker->lock, flags); - if (worker->cur_work == work) { + if (worker->cur_work == work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { send_sig(SIGINT, worker->task, 1); ret = true; } @@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; - wq->creds = data->creds; for_each_node(node) { struct io_wqe *wqe; @@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->done); - /* caller must have already done mmgrab() on this mm */ - wq->mm = data->mm; - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); @@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = -ENOMEM; goto err; } + refcount_set(&wq->use_refs, 1); reinit_completion(&wq->done); return wq; } @@ -1078,13 +1113,21 @@ err: return ERR_PTR(ret); } +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) +{ + if (data->get_work != wq->get_work || data->put_work != wq->put_work) + return false; + + return refcount_inc_not_zero(&wq->use_refs); +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { wake_up_process(worker->task); return false; } -void io_wq_destroy(struct io_wq *wq) +static void __io_wq_destroy(struct io_wq *wq) { int node; @@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq) kfree(wq->wqes); kfree(wq); } + +void io_wq_destroy(struct io_wq *wq) +{ + if (refcount_dec_and_test(&wq->use_refs)) + __io_wq_destroy(wq); +} diff --git a/fs/io-wq.h b/fs/io-wq.h index 3f5e356de980..50b3378febf2 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -7,11 +7,11 @@ enum { IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_NEEDS_USER = 8, - IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, IO_WQ_WORK_CB = 128, + IO_WQ_WORK_NO_CANCEL = 256, + IO_WQ_WORK_CONCURRENT = 512, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -72,6 +72,8 @@ struct io_wq_work { }; void (*func)(struct io_wq_work **); struct files_struct *files; + struct mm_struct *mm; + const struct cred *creds; unsigned flags; }; @@ -81,21 +83,22 @@ struct io_wq_work 
{ (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ + (work)->mm = NULL; \ + (work)->creds = NULL; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { - struct mm_struct *mm; struct user_struct *user; - const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index e54556b0fcc6..1806afddfea5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include <linux/compat.h> #include <linux/refcount.h> #include <linux/uio.h> +#include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> @@ -70,6 +71,10 @@ #include <linux/sizes.h> #include <linux/hugetlb.h> #include <linux/highmem.h> +#include <linux/namei.h> +#include <linux/fsnotify.h> +#include <linux/fadvise.h> +#include <linux/eventpoll.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -177,6 +182,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -184,10 +204,11 @@ struct io_ring_ctx { struct { unsigned int flags; - bool compat; - bool account_mem; - bool cq_overflow_flushed; - bool drain_next; + int compat: 1; + int account_mem: 1; + int cq_overflow_flushed: 1; + int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -207,13 +228,14 @@ struct io_ring_ctx { unsigned sq_thread_idle; unsigned cached_sq_dropped; atomic_t cached_cq_overflow; - struct io_uring_sqe *sq_sqes; + unsigned long sq_check_overflow; struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; + struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -229,8 +251,10 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; + int ring_fd; + struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -250,11 +274,14 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; @@ -267,7 +294,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - bool poll_multi_file; + struct llist_head poll_llist; + /* * ->poll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. 
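Turning the group of bool members of io_ring_ctx into single-bit bitfields packs the flags into one word. A standalone sketch of the space effect (field names borrowed from the struct above; the exact sizes are whatever the compiler on the build host produces):

#include <stdio.h>

struct ctx_flags_bools {
	_Bool compat, account_mem, cq_overflow_flushed, drain_next, eventfd_async;
};

struct ctx_flags_bits {
	int compat: 1;
	int account_mem: 1;
	int cq_overflow_flushed: 1;
	int drain_next: 1;
	int eventfd_async: 1;
};

int main(void)
{
	printf("five bools: %zu bytes, five 1-bit fields: %zu bytes\n",
	       sizeof(struct ctx_flags_bools), sizeof(struct ctx_flags_bits));
	return 0;
}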
@@ -277,6 +305,7 @@ struct io_ring_ctx { struct list_head poll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; + bool poll_multi_file; spinlock_t inflight_lock; struct list_head inflight_list; @@ -299,6 +328,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_close { + struct file *file; + struct file *put_file; + int fd; +}; + struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -319,6 +354,7 @@ struct io_sync { loff_t len; loff_t off; int flags; + int mode; }; struct io_cancel { @@ -348,8 +384,52 @@ struct io_connect { struct io_sr_msg { struct file *file; - struct user_msghdr __user *msg; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; int msg_flags; + size_t len; +}; + +struct io_open { + struct file *file; + int dfd; + union { + unsigned mask; + }; + struct filename *filename; + struct statx __user *buffer; + struct open_how how; +}; + +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; }; struct io_async_connect { @@ -370,15 +450,79 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. 
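The REQ_F_* masks are now derived from bit-index enums via BIT(), so the index list and the mask list cannot drift apart the way hand-written powers of two can. A reduced, compilable sketch of the pattern (only a few flags shown, BIT() defined locally; in the real code the first indices are taken from IOSQE_*_BIT):

#include <stdio.h>

#define BIT(n)	(1U << (n))

enum {
	REQ_F_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT,
	REQ_F_LINK_BIT,
};

enum {
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
};

int main(void)
{
	unsigned int flags = REQ_F_LINK | REQ_F_IO_DRAIN;

	printf("drain=%d fixed_file=%d link=%d\n",
	       !!(flags & REQ_F_IO_DRAIN), !!(flags & REQ_F_FIXED_FILE),
	       !!(flags & REQ_F_LINK));
	return 0;
}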
So you can @@ -396,11 +540,19 @@ struct io_kiocb { struct io_timeout timeout; struct io_connect connect; struct io_sr_msg sr_msg; + struct io_open open; + struct io_close close; + struct io_files_update files_update; + struct io_fadvise fadvise; + struct io_madvise madvise; + struct io_epoll epoll; }; struct io_async_ctx *io; - struct file *ring_file; - int ring_fd; + /* + * llist_node is only used for poll deferred completions + */ + struct llist_node llist_node; bool has_user; bool in_async; bool needs_fixed_file; @@ -414,23 +566,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 32 /* drain done */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ u64 user_data; u32 result; u32 sequence; @@ -463,14 +598,162 @@ struct io_submit_state { unsigned int ios_left; }; +struct io_op_def { + /* needs req->io allocated for deferral/async */ + unsigned async_ctx : 1; + /* needs current->mm setup, does mm access */ + unsigned needs_mm : 1; + /* needs req->file assigned */ + unsigned needs_file : 1; + /* needs req->file assigned IFF fd is >= 0 */ + unsigned fd_non_neg : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* needs file table */ + unsigned file_table : 1; +}; + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = {}, + [IORING_OP_READV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITEV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_REMOVE] = {}, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + }, + [IORING_OP_SENDMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECVMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_ACCEPT] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + .file_table = 1, + }, + [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_LINK_TIMEOUT] = { + .async_ctx = 1, + 
.needs_mm = 1, + }, + [IORING_OP_CONNECT] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + }, + [IORING_OP_OPENAT] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_CLOSE] = { + .needs_file = 1, + .file_table = 1, + }, + [IORING_OP_FILES_UPDATE] = { + .needs_mm = 1, + .file_table = 1, + }, + [IORING_OP_STATX] = { + .needs_mm = 1, + .needs_file = 1, + .fd_non_neg = 1, + }, + [IORING_OP_READ] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + }, + [IORING_OP_MADVISE] = { + .needs_mm = 1, + }, + [IORING_OP_SEND] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECV] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_OPENAT2] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); -static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); -static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); +static int io_grab_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req) static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + if (unlikely(req->flags & REQ_F_IO_DRAIN)) return __req_need_defer(req); return false; @@ -606,53 +891,53 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); + /* order cqe stores with ring update */ + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } +} + +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) +{ + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; } + if (!req->work.creds) + req->work.creds = get_current_cred(); } -static inline bool io_req_needs_user(struct io_kiocb *req) 
+static inline void io_req_work_drop_env(struct io_kiocb *req) { - return !(req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED); + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } } static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { + const struct io_op_def *def = &io_op_defs[req->opcode]; bool do_hashed = false; - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file) do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) + } else { + if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; - break; } - if (io_req_needs_user(req)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + + io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); return do_hashed; @@ -711,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) { - req->flags |= REQ_F_IO_DRAINED; + while ((req = io_get_deferred_req(ctx)) != NULL) io_queue_async_work(req); - } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) @@ -735,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -766,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* if force is set, the ring is going away. 
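io_prep_async_work() and the prep paths now consult the per-opcode io_op_defs[] table rather than a long switch over opcodes. A minimal userspace sketch of that table-driven style, with made-up opcodes and a trimmed property set:

#include <stdbool.h>
#include <stdio.h>

enum { OP_READV, OP_WRITEV, OP_FSYNC, OP_LAST };

struct op_def {
	bool needs_mm;
	bool needs_file;
	bool hash_reg_file;
	bool unbound_nonreg_file;
};

static const struct op_def op_defs[OP_LAST] = {
	[OP_READV]  = { .needs_mm = true, .needs_file = true, .unbound_nonreg_file = true },
	[OP_WRITEV] = { .needs_mm = true, .needs_file = true, .hash_reg_file = true,
			.unbound_nonreg_file = true },
	[OP_FSYNC]  = { .needs_file = true },
};

int main(void)
{
	int op = OP_WRITEV;
	bool is_regular_file = true;

	/* the prep/async path just indexes the table instead of switching on op */
	if (is_regular_file && op_defs[op].hash_reg_file)
		printf("hash this work by inode\n");
	else if (!is_regular_file && op_defs[op].unbound_nonreg_file)
		printf("queue on the unbound worker pool\n");
	return 0;
}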
always drop after that */ if (force) - ctx->cq_overflow_flushed = true; + ctx->cq_overflow_flushed = 1; cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { @@ -788,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); + if (cqe) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -821,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + } refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -863,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!percpu_ref_tryget(&ctx->refs)) - return NULL; - if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) @@ -898,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, got_it: req->io = NULL; - req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -915,24 +1209,35 @@ fallback: return NULL; } -static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +static void __io_req_do_free(struct io_kiocb *req) { - if (*nr) { - kmem_cache_free_bulk(req_cachep, *nr, reqs); - percpu_ref_put_many(&ctx->refs, *nr); - *nr = 0; + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); +} + +static void __io_req_aux_free(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + kfree(req->io); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + __io_req_aux_free(req); - if (req->io) - kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); @@ -941,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - percpu_ref_put(&ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); + + percpu_ref_put(&req->ctx->refs); + __io_req_do_free(req); +} + +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + int need_iter; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +{ + int fixed_refs = rb->to_free; + + if (!rb->to_free) + return; + if (rb->need_iter) { + int i, inflight = 0; + unsigned long flags; + + fixed_refs = 0; + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_FIXED_FILE) { + req->file = NULL; + fixed_refs++; + } + if (req->flags & REQ_F_INFLIGHT) + inflight++; + __io_req_aux_free(req); + } + if (!inflight) + goto do_free; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_INFLIGHT) { + 
list_del(&req->inflight_entry); + if (!--inflight) + break; + } + } + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + } +do_free: + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + if (fixed_refs) + percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = rb->need_iter = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1118,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; - /* - * noflush == true is from the waitqueue handler, just ensure we wake - * up the task, and the next invocation will flush the entries. We - * cannot safely to it from here. - */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) - return -1U; + if (test_bit(0, &ctx->cq_check_overflow)) { + /* + * noflush == true is from the waitqueue handler, just ensure + * we wake up the task, and the next invocation will flush the + * entries. We cannot safely to it from here. + */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false); + } /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -1141,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } +static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +{ + if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) + return false; + + if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); + return true; +} + /* * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { - void *reqs[IO_IOPOLL_BATCH]; + struct req_batch rb; struct io_kiocb *req; - int to_free; - to_free = 0; + rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -1159,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(req, req->result); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) { - /* If we're not using fixed files, we have to pair the - * completion part with the file put. Use regular - * completions for those, only batch free for fixed - * file and non-linked commands. 
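The req_batch machinery above collects completed requests and releases them in bulk (kmem_cache_free_bulk() plus batched reference drops) instead of freeing one request at a time. A simplified userspace analogue of the batching pattern, using plain malloc/free:

#include <stdlib.h>
#include <stdio.h>

#define BATCH	8

struct req_batch {
	void *reqs[BATCH];
	int to_free;
};

static void flush_batch(struct req_batch *rb)
{
	for (int i = 0; i < rb->to_free; i++)
		free(rb->reqs[i]);	/* kernel side: kmem_cache_free_bulk() */
	rb->to_free = 0;
}

static void batch_free(struct req_batch *rb, void *req)
{
	rb->reqs[rb->to_free++] = req;
	if (rb->to_free == BATCH)
		flush_batch(rb);
}

int main(void)
{
	struct req_batch rb = { .to_free = 0 };

	for (int i = 0; i < 20; i++)
		batch_free(&rb, malloc(32));
	flush_batch(&rb);		/* drop whatever is left over */
	printf("done\n");
	return 0;
}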
- */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && - !req->io) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - io_free_req(req); - } - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) + io_free_req(req); } io_commit_cqring(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1503,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1574,6 +1937,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else @@ -1671,6 +2038,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->rw.kiocb.private) return -EINVAL; + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; + } + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -1767,6 +2141,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].async_ctx) + return 0; req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); return req->io == NULL; } @@ -1786,8 +2162,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) + if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; @@ -2101,6 +2476,421 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if 
(work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.how.mode = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.how.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); + if (ret) + return ret; + + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct file *file; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = build_open_flags(&req->open.how, &op); + if (ret) + goto err; + + ret = get_unused_fd_flags(req->open.how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && 
defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.how.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) + return -EINVAL; + + req->open.filename = getname_flags(fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. 
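The fadvise handler above only punts POSIX_FADV_DONTNEED to a worker because that advice can trigger writeback, while the other hints are expected not to block. For reference, the equivalent plain-syscall usage from userspace looks like this (path chosen arbitrarily for the example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);	/* arbitrary example file */
	int ret;

	if (fd < 0)
		return 1;
	/* DONTNEED may have to write dirty pages back before dropping them,
	 * which is why the io_uring side refuses to run it non-blocking */
	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	printf("posix_fadvise: %d\n", ret);
	close(fd);
	return 0;
}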
+ */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ctx->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) + goto eagain; + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. + */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -2178,8 +2968,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); - if (!io) + if (!io || req->opcode == IORING_OP_SEND) return 0; io->msg.iov = io->msg.fast_iov; @@ -2259,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe) { @@ -2269,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!io) + if (!io || req->opcode == IORING_OP_RECV) return 0; io->msg.iov = io->msg.fast_iov; @@ -2351,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -2414,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; io_put_req(req); return -EAGAIN; } @@ -2635,6 +3528,39 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } +static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +{ + struct io_kiocb *req, *tmp; + struct req_batch rb; + + rb.to_free = rb.need_iter = 0; + spin_lock_irq(&ctx->completion_lock); + llist_for_each_entry_safe(req, tmp, nodes, llist_node) { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_free_req_many(ctx, &rb); +} + +static void io_poll_flush(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct llist_node *nodes; + + nodes = llist_del_all(&req->ctx->poll_llist); + if (nodes) + __io_poll_flush(req->ctx, nodes); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -2642,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -2656,17 +3581,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * If we have a link timeout we're going to need the completion_lock * for finalizing the request, mark us as having grabbed that already. 
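When the completion lock cannot be taken from the poll wake-up path, completed requests are pushed onto a lock-free llist and drained later by io_poll_flush(); only the push that finds the list empty schedules the flush work. A standalone sketch of that push/del_all pattern using C11 atomics (names are illustrative, not the kernel llist API):

#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static _Atomic(struct node *) list_head;

/* push a node; returns nonzero if the list was empty beforehand, i.e. the
 * caller is the one who must schedule the flush work */
static int list_push(struct node *n)
{
	struct node *first = atomic_load(&list_head);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&list_head, &first, n));
	return first == NULL;
}

/* detach the whole list in one shot, like llist_del_all() */
static struct node *list_del_all(void)
{
	return atomic_exchange(&list_head, NULL);
}

int main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 };

	printf("schedule flush: %d\n", list_push(&a));	/* 1: list was empty */
	printf("schedule flush: %d\n", list_push(&b));	/* 0: flush already pending */
	for (struct node *n = list_del_all(); n; n = n->next)
		printf("completing request %d\n", n->val);
	return 0;
}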
*/ - if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask) { + unsigned long flags; - io_cqring_ev_posted(ctx); - } else { - io_queue_async_work(req); + if (llist_empty(&ctx->poll_llist) && + spin_trylock_irqsave(&ctx->completion_lock, flags)) { + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + req = NULL; + } else { + req->result = mask; + req->llist_node.next = NULL; + /* if the list wasn't empty, we're done */ + if (!llist_add(&req->llist_node, &ctx->poll_llist)) + req = NULL; + else + req->work.func = io_poll_flush; + } } + if (req) + io_queue_async_work(req); return 1; } @@ -3071,20 +4010,67 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) + return -EAGAIN; + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret = 0; + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + switch (req->opcode) { case IORING_OP_NOP: break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: @@ -3100,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: @@ -3123,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept_prep(req, sqe); break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; + case 
IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3173,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: if (sqe) { ret = io_read_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3182,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: if (sqe) { ret = io_write_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3222,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: if (sqe) { ret = io_sendmsg_prep(req, sqe); if (ret < 0) break; } - ret = io_sendmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: if (sqe) { ret = io_recvmsg_prep(req, sqe); if (ret) break; } - ret = io_recvmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -3277,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_async_cancel(req, nxt); break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3311,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + /* if NO_CANCEL is set, we must still run the work */ + if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == + IO_WQ_WORK_CANCEL) { ret = -ECANCELED; + } if (!ret) { req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; @@ -3344,26 +4444,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } -static bool io_req_op_valid(int 
op) -{ - return op >= IORING_OP_NOP && op < IORING_OP_LAST; -} - -static int io_req_needs_file(struct io_kiocb *req) +static int io_req_needs_file(struct io_kiocb *req, int fd) { - switch (req->opcode) { - case IORING_OP_NOP: - case IORING_OP_POLL_REMOVE: - case IORING_OP_TIMEOUT: - case IORING_OP_TIMEOUT_REMOVE: - case IORING_OP_ASYNC_CANCEL: - case IORING_OP_LINK_TIMEOUT: + if (!io_op_defs[req->opcode].needs_file) return 0; - default: - if (io_req_op_valid(req->opcode)) - return 1; - return -EINVAL; - } + if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + return 0; + return 1; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -3371,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3380,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd, ret; + int fd; flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - if (flags & IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - - ret = io_req_needs_file(req); - if (ret <= 0) - return ret; + if (!io_req_needs_file(req, fd)) + return 0; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3401,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -3418,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (req->work.files) + return 0; + if (!ctx->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* @@ -3426,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. 
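A note on the reworked fixed-file path above: with IOSQE_FIXED_FILE, io_req_set_file() treats sqe->fd as an index into ctx->file_data (bounds-checked with array_index_nospec()) and takes a percpu reference on the table. A hedged userspace sketch of using a registered slot, assuming liburing with io_uring_prep_read(); read_fixed_slot, fd0/fd1 and the caller-supplied buffer are illustrative:

	#include <liburing.h>

	/* Register two files, then read from registered slot 1 by index:
	 * with IOSQE_FIXED_FILE the sqe's fd field is a table index. */
	static int read_fixed_slot(struct io_uring *ring, int fd0, int fd1,
				   char *buf, unsigned len)
	{
		int fds[2] = { fd0, fd1 };
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		int ret;

		ret = io_uring_register_files(ring, fds, 2);
		if (ret < 0)
			return ret;

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_read(sqe, 1 /* slot, not an fd */, buf, len, 0);
		io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);

		io_uring_submit(ring);
		ret = io_uring_wait_cqe(ring, &cqe);
		if (!ret) {
			ret = cqe->res;
			io_uring_cqe_seen(ring, cqe);
		}
		return ret;
	}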
*/ - if (fcheck(req->ring_fd) == req->ring_file) { + if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -3532,7 +4621,8 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { +punt: + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) goto err; @@ -3567,6 +4657,9 @@ done_req: if (nxt) { req = nxt; nxt = NULL; + + if (req->flags & REQ_F_FORCE_ASYNC) + goto punt; goto again; } } @@ -3575,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; - if (unlikely(req->ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = false; - } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { +fail_req: io_cqring_add_event(req, ret); req_set_fail_links(req); io_double_put_req(req); } - } else + } else if (req->flags & REQ_F_FORCE_ASYNC) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3602,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK) + IOSQE_IO_HARDLINK | IOSQE_ASYNC) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; - int ret; + unsigned int sqe_flags; + int ret, id; + + sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ - if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -3632,14 +4753,19 @@ err_req: * conditions are true (normal request), then just queue it. */ if (*link) { - struct io_kiocb *prev = *link; - - if (sqe->flags & IOSQE_IO_DRAIN) - (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - - if (sqe->flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; + struct io_kiocb *head = *link; + /* + * Taking sequential execution of a link, draining both sides + * of the link also fullfils IOSQE_IO_DRAIN semantics for all + * requests in the link. So, it drains the head and the + * next after the link request. The last one is done via + * drain_next flag to persist the effect across calls. 
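The submission path above copies IOSQE_IO_DRAIN/IOSQE_IO_HARDLINK/IOSQE_ASYNC straight into req->flags, and io_queue_sqe() punts REQ_F_FORCE_ASYNC requests to the workqueue (IO_WQ_WORK_CONCURRENT) rather than attempting them inline. A rough userspace sketch of a linked pair using the new IOSQE_ASYNC flag, assuming liburing and a kernel with this series; write_then_fsync is an illustrative helper:

	#include <liburing.h>

	/* Queue write -> fsync as a link; IOSQE_ASYNC on the write skips the
	 * inline attempt and goes straight to async execution. */
	static int write_then_fsync(struct io_uring *ring, int fd,
				    const char *buf, unsigned len)
	{
		struct io_uring_sqe *sqe;

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_write(sqe, fd, buf, len, 0);
		io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK | IOSQE_ASYNC);

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_fsync(sqe, fd, 0);

		return io_uring_submit(ring);	/* two CQEs will follow */
	}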
+ */ + if (sqe_flags & IOSQE_IO_DRAIN) { + head->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = 1; + } if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; @@ -3648,25 +4774,36 @@ err_req: ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ - prev->flags |= REQ_F_FAIL_LINK; + head->flags |= REQ_F_FAIL_LINK; goto err_req; } - trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->link_list, &prev->link_list); - } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { - req->flags |= REQ_F_LINK; - if (sqe->flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); - if (ret) - req->flags |= REQ_F_FAIL_LINK; - *link = req; + trace_io_uring_link(ctx, req, head); + list_add_tail(&req->link_list, &head->link_list); + + /* last request of a link, enqueue the link */ + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { + io_queue_link_head(head); + *link = NULL; + } } else { - io_queue_sqe(req, sqe); + if (unlikely(ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = 0; + } + if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + req->flags |= REQ_F_LINK; + INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; + *link = req; + } else { + io_queue_sqe(req, sqe); + } } + if (old_creds) + revert_creds(old_creds); return true; } @@ -3698,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); - } + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* @@ -3719,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe **sqe_ptr) { - struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; unsigned head; @@ -3731,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
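On the ring protocol touched here: io_commit_sqring() now publishes the consumed head unconditionally with smp_store_release(), and the acquire-load of the application-owned tail is done up front via io_sqring_entries() in io_submit_sqes(). The userspace half of that handshake, very roughly, with illustrative names (sq_view, sq_publish) and the GCC/Clang __atomic_store_n builtin standing in for a store-release helper; real code derives these pointers from mmap() plus io_uring_params.sq_off:

	/* Illustrative view of the mapped SQ ring (setup omitted). */
	struct sq_view {
		unsigned *ktail;	/* written by userspace, read by the kernel */
		unsigned *ring_mask;
		unsigned *array;	/* indices into the SQE array */
	};

	/* Publish one already-filled SQE slot to the kernel. */
	static void sq_publish(struct sq_view *sq, unsigned sqe_index)
	{
		unsigned tail = *sq->ktail;

		sq->array[tail & *sq->ring_mask] = sqe_index;
		/* Release store: the SQE and the array slot must be visible
		 * before the kernel can observe the new tail. */
		__atomic_store_n(sq->ktail, tail + 1, __ATOMIC_RELEASE);
	}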
*/ - head = ctx->cached_sq_head; - /* make sure SQ entry isn't read before tail */ - if (unlikely(head == smp_load_acquire(&rings->sq.tail))) - return false; - - head = READ_ONCE(sq_array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { /* * All io need record the previous position, if LINK vs DARIN, @@ -3754,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, /* drop invalid entries */ ctx->cached_sq_head++; ctx->cached_sq_dropped++; - WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); + WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); return false; } @@ -3768,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) - return -EBUSY; + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; + } + + /* make sure SQ entry isn't read before tail */ + nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); + + if (!percpu_ref_tryget_many(&ctx->refs, nr)) + return -EAGAIN; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); statep = &state; } + ctx->ring_fd = ring_fd; + ctx->ring_file = ring_file; + for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -3789,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } if (!io_get_sqring(ctx, req, &sqe)) { - __io_free_req(req); + __io_req_do_free(req); break; } - if (io_req_needs_user(req) && !*mm) { + /* will complete beyond this point, count as submitted */ + submitted++; + + if (unlikely(req->opcode >= IORING_OP_LAST)) { + io_cqring_add_event(req, -EINVAL); + io_double_put_req(req); + break; + } + + if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3801,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - submitted++; - sqe_flags = sqe->flags; - - req->ring_file = ring_file; - req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, async); if (!io_submit_sqe(req, sqe, statep, &link)) break; - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { - io_queue_link_head(link); - link = NULL; - } } + if (unlikely(submitted != nr)) { + int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; + + percpu_ref_put_many(&ctx->refs, nr - ref_used); + } if (link) io_queue_link_head(link); if (statep) @@ -3944,7 +5085,6 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); @@ -4075,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + percpu_ref_get(&data->refs); + /* wait for existing switches */ + flush_work(&data->ref_work); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + /* flush potential new switch */ + flush_work(&data->ref_work); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4118,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4175,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4237,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4252,36 +5403,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. 
+ */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -4298,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) 
break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4312,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4343,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4449,29 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. 
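The removal/update machinery above backs IORING_REGISTER_FILES_UPDATE (and, through io_files_update(), the new IORING_OP_FILES_UPDATE request): an fd of -1 in the update array clears a registered slot, any other value replaces it, and removals are queued on put_llist so the fput() happens after an atomic reference switch. A hedged liburing sketch; swap_and_clear and new_fd are illustrative:

	#include <liburing.h>

	/* Replace registered slot 0 with new_fd and clear slot 1;
	 * -1 removes the file at that offset. */
	static int swap_and_clear(struct io_uring *ring, int new_fd)
	{
		int fds[2] = { new_fd, -1 };

		/* offset 0, two entries: slots 0 and 1 of the fixed table */
		return io_uring_register_files_update(ring, 0, fds, 2);
	}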
+ */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4481,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4516,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? 
done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -4536,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -4584,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; - data.user = ctx->user; - data.creds = ctx->creds; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -4702,7 +6005,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_user_page(imu->bvec[j].bv_page); + unpin_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -4823,7 +6126,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages(ubuf, nr_pages, + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); if (pret == nr_pages) { @@ -4847,7 +6150,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * release any pages we did get */ if (pret > 0) - put_user_pages(pages, pret); + unpin_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -4975,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx 
*ctx) { mutex_lock(&ctx->uring_lock); @@ -4991,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -5157,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } else if (to_submit) { struct mm_struct *cur_mm; - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ cur_mm = ctx->sqo_mm; @@ -5273,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5292,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -5310,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -5376,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -5403,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); @@ -5422,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds 
= get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -5437,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. 
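io_probe() above reports which opcodes the running kernel supports (IO_URING_OP_SUPPORTED in ops[].flags, at most 256 entries), and io_register_personality() stores the caller's credentials under an id that later SQEs can name via sqe->personality. A hedged sketch of the probe from userspace through the raw register syscall, assuming kernel headers new enough to define IORING_REGISTER_PROBE and struct io_uring_probe; dump_supported_ops is an illustrative name:

	#include <linux/io_uring.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Ask the kernel which io_uring opcodes it supports on ring_fd. */
	static int dump_supported_ops(int ring_fd)
	{
		struct io_uring_probe *p;
		int i, ret;

		/* 256 is the cap enforced by IORING_REGISTER_PROBE */
		p = calloc(1, sizeof(*p) + 256 * sizeof(p->ops[0]));
		if (!p)
			return -1;

		ret = syscall(__NR_io_uring_register, ring_fd,
			      IORING_REGISTER_PROBE, p, 256);
		if (ret >= 0) {
			for (i = 0; i < p->ops_len; i++)
				printf("op %d: %s\n", p->ops[i].op,
				       (p->ops[i].flags & IO_URING_OP_SUPPORTED) ?
				       "supported" : "not supported");
		}
		free(p);
		return ret;
	}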
+ */ + mutex_unlock(&ctx->uring_lock); + ret = wait_for_completion_interruptible(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + if (ret) { + percpu_ref_resurrect(&ctx->refs); + ret = -EINTR; + goto out; + } + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5473,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; @@ -5484,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + /* bring the ctx back to life */ + percpu_ref_reinit(&ctx->refs); +out: + reinit_completion(&ctx->completions[0]); + } return ret; } @@ -5524,6 +6963,7 @@ out_fput: static int __init io_uring_init(void) { + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; diff --git a/fs/ioctl.c b/fs/ioctl.c index 2f5e4e5b97e1..7c9a5df5a597 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -467,7 +467,7 @@ EXPORT_SYMBOL(generic_block_fiemap); * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. 
*/ -int ioctl_preallocate(struct file *filp, int mode, void __user *argp) +static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; @@ -495,8 +495,8 @@ int ioctl_preallocate(struct file *filp, int mode, void __user *argp) /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ -int compat_ioctl_preallocate(struct file *file, int mode, - struct space_resv_32 __user *argp) +static int compat_ioctl_preallocate(struct file *file, int mode, + struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; @@ -521,11 +521,9 @@ int compat_ioctl_preallocate(struct file *file, int mode, } #endif -static int file_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { struct inode *inode = file_inode(filp); - int __user *p = (int __user *)arg; switch (cmd) { case FIBMAP: @@ -542,7 +540,7 @@ static int file_ioctl(struct file *filp, unsigned int cmd, return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } - return vfs_ioctl(filp, cmd, arg); + return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) @@ -661,53 +659,48 @@ out: } /* - * When you add any new common ioctls to the switches above and below - * please update compat_sys_ioctl() too. - * * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. + * + * When you add any new common ioctls to the switches above and below, + * please ensure they have compatible arguments in compat mode. */ -int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, - unsigned long arg) +static int do_vfs_ioctl(struct file *filp, unsigned int fd, + unsigned int cmd, unsigned long arg) { - int error = 0; void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); - break; + return 0; case FIONCLEX: set_close_on_exec(fd, 0); - break; + return 0; case FIONBIO: - error = ioctl_fionbio(filp, argp); - break; + return ioctl_fionbio(filp, argp); case FIOASYNC: - error = ioctl_fioasync(fd, filp, argp); - break; + return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); - error = copy_to_user(argp, &res, sizeof(res)) ? - -EFAULT : 0; - } else - error = -ENOTTY; - break; + return copy_to_user(argp, &res, sizeof(res)) ? 
+ -EFAULT : 0; + } + + return -ENOTTY; case FIFREEZE: - error = ioctl_fsfreeze(filp); - break; + return ioctl_fsfreeze(filp); case FITHAW: - error = ioctl_fsthaw(filp); - break; + return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); @@ -716,6 +709,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; + return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: @@ -729,24 +723,30 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, default: if (S_ISREG(inode->i_mode)) - error = file_ioctl(filp, cmd, arg); - else - error = vfs_ioctl(filp, cmd, arg); + return file_ioctl(filp, cmd, argp); break; } - return error; + + return -ENOIOCTLCMD; } int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - int error; struct fd f = fdget(fd); + int error; if (!f.file) return -EBADF; + error = security_file_ioctl(f.file, cmd, arg); - if (!error) - error = do_vfs_ioctl(f.file, fd, cmd, arg); + if (error) + goto out; + + error = do_vfs_ioctl(f.file, fd, cmd, arg); + if (error == -ENOIOCTLCMD) + error = vfs_ioctl(f.file, cmd, arg); + +out: fdput(f); return error; } @@ -788,4 +788,65 @@ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); + +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + struct fd f = fdget(fd); + int error; + + if (!f.file) + return -EBADF; + + /* RED-PEN how should LSM module know it's handling 32bit? */ + error = security_file_ioctl(f.file, cmd, arg); + if (error) + goto out; + + switch (cmd) { + /* FICLONE takes an int argument, so don't use compat_ptr() */ + case FICLONE: + error = ioctl_file_clone(f.file, arg, 0, 0, 0); + break; + +#if defined(CONFIG_X86_64) + /* these get messy on amd64 due to alignment differences */ + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); + break; + case FS_IOC_UNRESVSP_32: + case FS_IOC_UNRESVSP64_32: + error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, + compat_ptr(arg)); + break; + case FS_IOC_ZERO_RANGE_32: + error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, + compat_ptr(arg)); + break; +#endif + + /* + * everything else in do_vfs_ioctl() takes either a compatible + * pointer argument or no argument -- call it with a modified + * argument. 
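As the comment above notes, FICLONE is special-cased in the new compat path because its argument is a file descriptor, not a pointer, so compat_ptr() must not be applied; the FS_IOC_RESVSP family is special-cased only for the i386-on-x86_64 alignment of struct space_resv. For reference, the native call that ends up in ioctl_file_clone() looks like this; clone_file is an illustrative helper and both fds must live on the same reflink-capable filesystem:

	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FICLONE */

	/* Clone (reflink) the whole contents of src_fd into dest_fd. */
	static int clone_file(int dest_fd, int src_fd)
	{
		/* note: the third ioctl argument is an fd, not a pointer */
		return ioctl(dest_fd, FICLONE, src_fd);
	}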
+ */ + default: + error = do_vfs_ioctl(f.file, fd, cmd, + (unsigned long)compat_ptr(arg)); + if (error != -ENOIOCTLCMD) + break; + + if (f.file->f_op->compat_ioctl) + error = f.file->f_op->compat_ioctl(f.file, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -ENOTTY; + break; + } + + out: + fdput(f); + + return error; +} #endif diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 828444e14d09..7c84c4c027c4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1077,24 +1077,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); unsigned long length; - loff_t offset, size; + loff_t offset; ssize_t ret; lock_page(page); - size = i_size_read(inode); - offset = page_offset(page); - if (page->mapping != inode->i_mapping || offset > size) { - /* We overload EFAULT to mean page got truncated */ - ret = -EFAULT; + ret = page_mkwrite_check_truncate(page, inode); + if (ret < 0) goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (offset > size - PAGE_SIZE) - length = offset_in_page(size); - else - length = PAGE_SIZE; + length = ret; + offset = page_offset(page); while (length > 0) { ret = iomap_apply(inode, offset, length, IOMAP_WRITE | IOMAP_FAULT, ops, page, diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 8fff6677a5da..96bf33986d03 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -164,7 +164,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) "journal space in %s\n", __func__, journal->j_devname); WARN_ON(1); - jbd2_journal_abort(journal, 0); + jbd2_journal_abort(journal, -EIO); } write_lock(&journal->j_state_lock); } else { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f0b362b3842..2494095e0340 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -782,7 +782,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } blk_finish_plug(&plug); @@ -875,7 +875,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } if (cbh) err = journal_wait_on_commit_record(journal, cbh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5e408ee24a1a..60bf8ff78913 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -96,7 +96,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); #ifdef CONFIG_JBD2_DEBUG @@ -805,7 +804,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); err = -EIO; - __journal_abort_soft(journal, err); + jbd2_journal_abort(journal, err); } } else { *retp = blocknr; /* +journal->j_blk_offset */ @@ -982,6 +981,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) { + (*pos)++; return NULL; } @@ -1710,6 +1710,11 @@ int jbd2_journal_load(journal_t *journal) journal->j_devname); return -EFSCORRUPTED; } + /* + * clear JBD2_ABORT flag initialized in journal_init_common + * here to update log tail information with the newest seq. 
+ */ + journal->j_flags &= ~JBD2_ABORT; /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory @@ -1717,7 +1722,6 @@ int jbd2_journal_load(journal_t *journal) if (journal_reset(journal)) goto recovery_error; - journal->j_flags &= ~JBD2_ABORT; journal->j_flags |= JBD2_LOADED; return 0; @@ -2098,67 +2102,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) return err; } -/* - * Journal abort has very specific semantics, which we describe - * for journal abort. - * - * Two internal functions, which provide abort to the jbd layer - * itself are here. - */ - -/* - * Quick version for internal journal use (doesn't lock the journal). - * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, - * and don't attempt to make any other journal updates. - */ -void __jbd2_journal_abort_hard(journal_t *journal) -{ - transaction_t *transaction; - - if (journal->j_flags & JBD2_ABORT) - return; - - printk(KERN_ERR "Aborting journal on device %s.\n", - journal->j_devname); - - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_ABORT; - transaction = journal->j_running_transaction; - if (transaction) - __jbd2_log_start_commit(journal, transaction->t_tid); - write_unlock(&journal->j_state_lock); -} - -/* Soft abort: record the abort error status in the journal superblock, - * but don't do any other IO. */ -static void __journal_abort_soft (journal_t *journal, int errno) -{ - int old_errno; - - write_lock(&journal->j_state_lock); - old_errno = journal->j_errno; - if (!journal->j_errno || errno == -ESHUTDOWN) - journal->j_errno = errno; - - if (journal->j_flags & JBD2_ABORT) { - write_unlock(&journal->j_state_lock); - if (!old_errno && old_errno != -ESHUTDOWN && - errno == -ESHUTDOWN) - jbd2_journal_update_sb_errno(journal); - return; - } - write_unlock(&journal->j_state_lock); - - __jbd2_journal_abort_hard(journal); - - if (errno) { - jbd2_journal_update_sb_errno(journal); - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); - } -} - /** * void jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. @@ -2198,16 +2141,51 @@ static void __journal_abort_soft (journal_t *journal, int errno) * failure to disk. ext3_error, for example, now uses this * functionality. * - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). - * */ void jbd2_journal_abort(journal_t *journal, int errno) { - __journal_abort_soft(journal, errno); + transaction_t *transaction; + + /* + * ESHUTDOWN always takes precedence because a file system check + * caused by any other journal abort error is not required after + * a shutdown triggered. + */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + int old_errno = journal->j_errno; + + write_unlock(&journal->j_state_lock); + if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { + journal->j_errno = errno; + jbd2_journal_update_sb_errno(journal); + } + return; + } + + /* + * Mark the abort as occurred and start current running transaction + * to release all journaled buffer. 
+ */ + pr_err("Aborting journal on device %s.\n", journal->j_devname); + + journal->j_flags |= JBD2_ABORT; + journal->j_errno = errno; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); + + /* + * Record errno to the journal super block, so that fsck and jbd2 + * layer could realise that a filesystem check is needed. + */ + jbd2_journal_update_sb_errno(journal); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); } /** @@ -2556,7 +2534,6 @@ static void __journal_remove_journal_head(struct buffer_head *bh) { struct journal_head *jh = bh2jh(bh); - J_ASSERT_JH(jh, jh->b_jcount >= 0); J_ASSERT_JH(jh, jh->b_transaction == NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 27b9f9dee434..e77a5a0b4e46 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -525,7 +525,7 @@ EXPORT_SYMBOL(jbd2__journal_start); * modified buffers in the log. We block until the log can guarantee * that much space. Additionally, if rsv_blocks > 0, we also create another * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction + * stored in h_rsv_handle. It is not attached to any particular transaction * and thus doesn't block transaction commit. If the caller uses this reserved * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() * on the parent handle will dispose the reserved one. Reserved handle has to @@ -1595,7 +1595,7 @@ out: * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ -int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 9d96e6871e1a..9aec80b9d7c6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1266,7 +1266,7 @@ void kernfs_activate(struct kernfs_node *kn) pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (!pos || (pos->flags & KERNFS_ACTIVATED)) + if (pos->flags & KERNFS_ACTIVATED) continue; WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); diff --git a/fs/namei.c b/fs/namei.c index 4fb61e0754ed..db6565c99825 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -491,7 +491,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; - unsigned seq, m_seq; + unsigned seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -641,6 +641,14 @@ static bool legitimize_links(struct nameidata *nd) static bool legitimize_root(struct nameidata *nd) { + /* + * For scoped-lookups (where nd->root has been zeroed), we need to + * restart the whole lookup from scratch -- because set_root() is wrong + * for these lookups (nd->dfd is the root, not the filesystem root). + */ + if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) + return false; + /* Nothing to do if nd->root is zero or is managed by the VFS user. 
*/ if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) return true; nd->flags |= LOOKUP_ROOT_GRABBED; @@ -776,12 +784,37 @@ static int complete_walk(struct nameidata *nd) int status; if (nd->flags & LOOKUP_RCU) { - if (!(nd->flags & LOOKUP_ROOT)) + /* + * We don't want to zero nd->root for scoped-lookups or + * externally-managed nd->root. + */ + if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; if (unlikely(unlazy_walk(nd))) return -ECHILD; } + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't + * ever step outside the root during lookup" and should already + * be guaranteed by the rest of namei, we want to avoid a namei + * BUG resulting in userspace being given a path that was not + * scoped within the root at some point during the lookup. + * + * So, do a final sanity-check to make sure that in the + * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) + * we won't silently return an fd completely outside of the + * requested root to userspace. + * + * Userspace could move the path outside the root after this + * check, but as discussed elsewhere this is not a concern (the + * resolved file was inside the root at some point). + */ + if (!path_is_under(&nd->path, &nd->root)) + return -EXDEV; + } + if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; @@ -798,10 +831,18 @@ static int complete_walk(struct nameidata *nd) return status; } -static void set_root(struct nameidata *nd) +static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; + /* + * Jumping to the real root in a scoped-lookup is a BUG in namei, but we + * still have to ensure it doesn't happen because it will cause a breakout + * from the dirfd. + */ + if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) + return -ENOTRECOVERABLE; + if (nd->flags & LOOKUP_RCU) { unsigned seq; @@ -814,6 +855,7 @@ static void set_root(struct nameidata *nd) get_fs_root(fs, &nd->root); nd->flags |= LOOKUP_ROOT_GRABBED; } + return 0; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -837,6 +879,18 @@ static inline void path_to_nameidata(const struct path *path, static int nd_jump_root(struct nameidata *nd) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { + /* Absolute path arguments to path_init() are allowed. */ + if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) + return -EXDEV; + } + if (!nd->root.mnt) { + int error = set_root(nd); + if (error) + return error; + } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; @@ -859,14 +913,32 @@ static int nd_jump_root(struct nameidata *nd) * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ -void nd_jump_link(struct path *path) +int nd_jump_link(struct path *path) { + int error = -ELOOP; struct nameidata *nd = current->nameidata; - path_put(&nd->path); + if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) + goto err; + + error = -EXDEV; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { + if (nd->path.mnt != path->mnt) + goto err; + } + /* Not currently safe for scoped-lookups. 
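The LOOKUP_BENEATH / LOOKUP_NO_XDEV / LOOKUP_NO_MAGICLINKS / LOOKUP_NO_SYMLINKS / LOOKUP_IS_SCOPED handling added here is the in-kernel side of the resolution restrictions that openat2() exposes as RESOLVE_* flags. A hedged userspace sketch, assuming headers new enough to provide <linux/openat2.h> and __NR_openat2; open_beneath is an illustrative helper:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/openat2.h>	/* struct open_how, RESOLVE_* */

	/* Open path strictly beneath dirfd: "..", absolute symlinks and
	 * mount crossings may not escape the tree under dirfd. */
	static int open_beneath(int dirfd, const char *path)
	{
		struct open_how how;

		memset(&how, 0, sizeof(how));
		how.flags = O_RDONLY;
		how.resolve = RESOLVE_BENEATH | RESOLVE_NO_XDEV |
			      RESOLVE_NO_MAGICLINKS;

		return syscall(__NR_openat2, dirfd, path, &how, sizeof(how));
	}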
*/ + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) + goto err; + + path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; + return 0; + +err: + path_put(path); + return error; } static inline void put_link(struct nameidata *nd) @@ -1050,6 +1122,9 @@ const char *get_link(struct nameidata *nd) int error; const char *res; + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + return ERR_PTR(-ELOOP); + if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); @@ -1084,10 +1159,9 @@ const char *get_link(struct nameidata *nd) return res; } if (*res == '/') { - if (!nd->root.mnt) - set_root(nd); - if (unlikely(nd_jump_root(nd))) - return ERR_PTR(-ECHILD); + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); while (unlikely(*++res == '/')) ; } @@ -1269,10 +1343,14 @@ static int follow_managed(struct path *path, struct nameidata *nd) break; } - if (need_mntput && path->mnt == mnt) - mntput(path->mnt); - if (need_mntput) - nd->flags |= LOOKUP_JUMPED; + if (need_mntput) { + if (path->mnt == mnt) + mntput(path->mnt); + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + ret = -EXDEV; + else + nd->flags |= LOOKUP_JUMPED; + } if (ret == -EISDIR || !ret) ret = 1; if (ret > 0 && unlikely(d_flags_negative(flags))) @@ -1333,6 +1411,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return false; path->mnt = &mounted->mnt; path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; @@ -1353,8 +1433,11 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct inode *inode = nd->inode; while (1) { - if (path_equal(&nd->path, &nd->root)) + if (path_equal(&nd->path, &nd->root)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -ECHILD; break; + } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; @@ -1367,7 +1450,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->path.dentry = parent; nd->seq = seq; if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + return -ECHILD; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1379,6 +1462,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; if (&mparent->mnt == nd->path.mnt) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -ECHILD; /* we know that mountpoint was pinned */ nd->path.dentry = mountpoint; nd->path.mnt = &mparent->mnt; @@ -1393,6 +1478,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; if (!mounted) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -ECHILD; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; inode = nd->path.dentry->d_inode; @@ -1480,9 +1567,12 @@ static int path_parent_directory(struct path *path) static int follow_dotdot(struct nameidata *nd) { - while(1) { - if (path_equal(&nd->path, &nd->root)) + while (1) { + if (path_equal(&nd->path, &nd->root)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; break; + } if (nd->path.dentry != nd->path.mnt->mnt_root) { int ret = path_parent_directory(&nd->path); if (ret) @@ -1491,6 +1581,8 @@ static int follow_dotdot(struct nameidata *nd) } if (!follow_up(&nd->path)) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -EXDEV; } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; @@ -1699,12 +1791,33 @@ static inline int may_lookup(struct nameidata *nd) static 
inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { - if (!nd->root.mnt) - set_root(nd); - if (nd->flags & LOOKUP_RCU) { - return follow_dotdot_rcu(nd); - } else - return follow_dotdot(nd); + int error = 0; + + if (!nd->root.mnt) { + error = set_root(nd); + if (error) + return error; + } + if (nd->flags & LOOKUP_RCU) + error = follow_dotdot_rcu(nd); + else + error = follow_dotdot(nd); + if (error) + return error; + + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * If there was a racing rename or mount along our + * path, then we can't be sure that ".." hasn't jumped + * above nd->root (and so userspace should retry or use + * some fallback). + */ + smp_rmb(); + if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + return -EAGAIN; + if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + return -EAGAIN; + } } return 0; } @@ -2158,6 +2271,7 @@ OK: /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { + int error; const char *s = nd->name->name; if (!*s) @@ -2168,6 +2282,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; + + nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); + nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); + smp_rmb(); + if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; @@ -2176,9 +2295,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; - nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } @@ -2189,13 +2307,16 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->path.mnt = NULL; nd->path.dentry = NULL; - nd->m_seq = read_seqbegin(&mount_lock); - if (*s == '/') { - set_root(nd); - if (likely(!nd_jump_root(nd))) - return s; - return ERR_PTR(-ECHILD); - } else if (nd->dfd == AT_FDCWD) { + /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ + if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + return s; + } + + /* Relative pathname -- get the starting-point it is relative to. */ + if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; @@ -2210,7 +2331,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } - return s; } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); @@ -2235,8 +2355,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->inode = nd->path.dentry->d_inode; } fdput(f); - return s; } + + /* For scoped-lookups we need to set the root to the dirfd as well. 
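The LOOKUP_IN_ROOT / LOOKUP_IS_SCOPED plumbing above is what backs openat2(2)'s RESOLVE_IN_ROOT: the dirfd itself becomes the lookup root, so absolute paths and ".." cannot escape it, and a rename or mount race during a scoped ".." walk surfaces as -EAGAIN for userspace to retry. A minimal userspace sketch of that usage follows; it is illustrative only: the /srv/untrusted path is made up, struct open_how is declared locally with the layout the accompanying uapi header uses, and the RESOLVE_IN_ROOT value and the x86-64 syscall number 437 are assumptions to verify against the installed linux/openat2.h.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copy of the openat2(2) argument struct; layout assumed to match
 * the uapi header introduced alongside this change. */
struct open_how_compat {
	uint64_t flags;
	uint64_t mode;
	uint64_t resolve;
};

#ifndef RESOLVE_IN_ROOT
#define RESOLVE_IN_ROOT	0x10	/* assumed value; check linux/openat2.h */
#endif
#ifndef __NR_openat2
#define __NR_openat2	437	/* x86-64; other architectures differ */
#endif

static long openat2_compat(int dfd, const char *path,
			   struct open_how_compat *how)
{
	return syscall(__NR_openat2, dfd, path, how, sizeof(*how));
}

int main(void)
{
	struct open_how_compat how;
	long fd;
	int rootfd;

	rootfd = open("/srv/untrusted", O_PATH | O_DIRECTORY); /* hypothetical dir */
	if (rootfd < 0)
		return 1;

	memset(&how, 0, sizeof(how));
	how.flags = O_RDONLY;
	how.resolve = RESOLVE_IN_ROOT;	/* "/" and ".." stay inside rootfd */

	/* A racing rename/mount during a scoped ".." walk returns -EAGAIN;
	 * retrying is the expected fallback. */
	do {
		fd = openat2_compat(rootfd, "/etc/passwd", &how);
	} while (fd < 0 && errno == EAGAIN);

	if (fd < 0)
		perror("openat2");
	else
		printf("opened fd %ld, resolved under /srv/untrusted\n", fd);
	return 0;
}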
*/ + if (flags & LOOKUP_IS_SCOPED) { + nd->root = nd->path; + if (flags & LOOKUP_RCU) { + nd->root_seq = nd->seq; + } else { + path_get(&nd->root); + nd->flags |= LOOKUP_ROOT_GRABBED; + } + } + return s; } static const char *trailing_symlink(struct nameidata *nd) @@ -3202,8 +3333,8 @@ static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; - kuid_t dir_uid = dir->d_inode->i_uid; - umode_t dir_mode = dir->d_inode->i_mode; + kuid_t dir_uid = nd->inode->i_uid; + umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 15f271401dcc..573b1da9342c 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -84,8 +84,10 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 3800ab6f08fa..7def925d3af5 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -238,8 +238,10 @@ void nfs_fscache_init_inode(struct inode *inode) return; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); @@ -263,8 +265,10 @@ void nfs_fscache_clear_inode(struct inode *inode) dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -305,8 +309,10 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) return; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index ad041cfbf9ec..6754c8607230 100644 --- a/fs/nfs/fscache.h +++ 
b/fs/nfs/fscache.h @@ -62,9 +62,11 @@ struct nfs_fscache_key { * cache object. */ struct nfs_fscache_inode_auxdata { - struct timespec mtime; - struct timespec ctime; - u64 change_attr; + s64 mtime_sec; + s64 mtime_nsec; + s64 ctime_sec; + s64 ctime_nsec; + u64 change_attr; }; /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 936c57779ff4..728d88b6a698 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4097,7 +4097,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } - dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: atime=%lld\n", __func__, time->tv_sec); return status; } @@ -4115,7 +4115,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s status = NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } - dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: ctime=%lld\n", __func__, time->tv_sec); return status; } @@ -4132,8 +4132,8 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, status = decode_attr_time(xdr, time); bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; } - dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, - (long)time->tv_nsec); + dprintk("%s: time_delta=%lld %ld\n", __func__, time->tv_sec, + time->tv_nsec); return status; } @@ -4197,7 +4197,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } - dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: mtime=%lld\n", __func__, time->tv_sec); return status; } diff --git a/fs/nsfs.c b/fs/nsfs.c index f75767bd623a..b13bfd406820 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -55,7 +55,7 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static void *__ns_get_path(struct path *path, struct ns_common *ns) +static int __ns_get_path(struct path *path, struct ns_common *ns) { struct vfsmount *mnt = nsfs_mnt; struct dentry *dentry; @@ -74,13 +74,13 @@ static void *__ns_get_path(struct path *path, struct ns_common *ns) got_it: path->mnt = mntget(mnt); path->dentry = dentry; - return NULL; + return 0; slow: rcu_read_unlock(); inode = new_inode_pseudo(mnt->mnt_sb); if (!inode) { ns->ops->put(ns); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } inode->i_ino = ns->inum; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -92,7 +92,7 @@ slow: dentry = d_alloc_anon(mnt->mnt_sb); if (!dentry) { iput(inode); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } d_instantiate(dentry, inode); dentry->d_fsdata = (void *)ns->ops; @@ -101,23 +101,22 @@ slow: d_delete(dentry); /* make sure ->d_prune() does nothing */ dput(dentry); cpu_relax(); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } goto got_it; } -void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, +int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { - void *ret; + int ret; do { struct ns_common *ns = ns_get_cb(private_data); if (!ns) - return ERR_PTR(-ENOENT); - + return -ENOENT; ret = __ns_get_path(path, ns); - } while (ret == ERR_PTR(-EAGAIN)); + } while (ret == -EAGAIN); return ret; } @@ -134,7 +133,7 @@ static struct ns_common *ns_get_path_task(void *private_data) return args->ns_ops->get(args->task); } -void *ns_get_path(struct path *path, struct task_struct *task, +int ns_get_path(struct path *path, struct task_struct *task, const struct 
proc_ns_operations *ns_ops) { struct ns_get_path_task_args args = { @@ -150,7 +149,7 @@ int open_related_ns(struct ns_common *ns, { struct path path = {}; struct file *f; - void *err; + int err; int fd; fd = get_unused_fd_flags(O_CLOEXEC); @@ -167,11 +166,11 @@ int open_related_ns(struct ns_common *ns, } err = __ns_get_path(&path, relative); - } while (err == ERR_PTR(-EAGAIN)); + } while (err == -EAGAIN); - if (IS_ERR(err)) { + if (err) { put_unused_fd(fd); - return PTR_ERR(err); + return err; } f = dentry_open(&path, O_RDONLY, current_cred()); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 5c424a099280..1ef24574f481 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -73,7 +73,7 @@ static void o2quo_fence_self(void) "system by restarting ***\n"); emergency_restart(); break; - }; + } } /* Indicate that a timeout occurred on a heartbeat region write. The diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index 38b224372776..5e700b45d32d 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -ccflags-y := -I $(srctree)/$(src)/.. - obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 4de89af96abf..6abaded3ff6b 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -23,15 +23,15 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index aaf24548b02a..0463dce65bb2 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -688,10 +688,6 @@ struct dlm_begin_reco __be32 pad2; }; - -#define BITS_PER_BYTE 8 -#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) - struct dlm_query_join_request { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 965f45dbe17b..6051edc33aef 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -23,9 +23,9 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -33,7 +33,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* NOTE: __dlmconvert_master is the only function in here that * needs a spinlock held on entry (res->spinlock) and it is the diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 4d0b452012b2..c5c6efba7b5e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -17,9 +17,9 @@ #include <linux/debugfs.h> #include <linux/export.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -27,7 +27,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM -#include 
"cluster/masklog.h" +#include "../cluster/masklog.h" static int stringify_lockname(const char *lockname, int locklen, char *buf, int len); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index ee6f459f9770..357cfc702ce3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -20,9 +20,9 @@ #include <linux/debugfs.h> #include <linux/sched/signal.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -30,7 +30,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* * ocfs2 node maps are array of long int, which limits to send them freely diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index baff087f3863..83f0760e4fba 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -25,9 +25,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -35,7 +35,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static struct kmem_cache *dlm_lock_cache; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 74b768ca1cd8..900f7e466d11 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -25,9 +25,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -35,7 +35,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_mle_node_down(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -2554,8 +2554,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, if (!dlm_grab(dlm)) return -EINVAL; - BUG_ON(target == O2NM_MAX_NODES); - name = res->lockname.name; namelen = res->lockname.len; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 064ce5bbc3f6..4b566e88582f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -26,16 +26,16 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); @@ -1668,7 +1668,7 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master) { - int ret = -EINVAL; + int ret; struct dlm_master_requery req; int status = DLM_LOCK_RES_OWNER_UNKNOWN; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 61c51c268460..fd40c17cd022 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -25,16 +25,16 @@ #include <linux/delay.h> -#include 
"cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 3883633e82eb..dcb17ca8ae74 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -23,15 +23,15 @@ #include <linux/spinlock.h> #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" #define DLM_UNLOCK_FREE_LOCK 0x00000001 #define DLM_UNLOCK_CALL_AST 0x00000002 diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index a9874e441bd4..c7895f65be0e 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -ccflags-y := -I $(srctree)/$(src)/.. - obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 4f1668c81e1f..8e4f1ace467c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -33,11 +33,11 @@ #include <linux/uaccess.h> -#include "stackglue.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static const struct super_operations dlmfs_ops; diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 525b14ddfba5..3df5be25bfb1 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -21,12 +21,12 @@ #include <linux/types.h> #include <linux/crc32.h> -#include "ocfs2_lockingver.h" -#include "stackglue.h" +#include "../ocfs2_lockingver.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index cda1027d0819..cb9e6a73bea9 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -570,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ break; - }; + } ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, generation, res->l_name); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3103ba7f97a2..bfe611ed1b1d 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -597,9 +597,11 @@ static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, { struct ocfs2_inode_info *oi = OCFS2_I(inode); - oi->i_sync_tid = handle->h_transaction->t_tid; - if (datasync) - oi->i_datasync_tid = handle->h_transaction->t_tid; + if (!is_handle_aborted(handle)) { + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; + } } #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8ea51cf27b97..da65251ef815 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -586,8 +586,7 @@ static int 
__ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } - oi->i_sync_tid = handle->h_transaction->t_tid; - oi->i_datasync_tid = handle->h_transaction->t_tid; + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: if (status < 0) { diff --git a/fs/open.c b/fs/open.c index b62f5c0923a8..0788b3715731 100644 --- a/fs/open.c +++ b/fs/open.c @@ -955,48 +955,83 @@ struct file *open_with_fake_path(const struct path *path, int flags, } EXPORT_SYMBOL(open_with_fake_path); -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) +#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) + +inline struct open_how build_open_how(int flags, umode_t mode) +{ + struct open_how how = { + .flags = flags & VALID_OPEN_FLAGS, + .mode = mode & S_IALLUGO, + }; + + /* O_PATH beats everything else. */ + if (how.flags & O_PATH) + how.flags &= O_PATH_FLAGS; + /* Modes should only be set for create-like flags. */ + if (!WILL_CREATE(how.flags)) + how.mode = 0; + return how; +} + +inline int build_open_flags(const struct open_how *how, struct open_flags *op) { + int flags = how->flags; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* Must never be set by userspace */ + flags &= ~(FMODE_NONOTIFY | O_CLOEXEC); + /* - * Clear out all open flags we don't know about so that we don't report - * them in fcntl(F_GETFD) or similar interfaces. + * Older syscalls implicitly clear all of the invalid flags or argument + * values before calling build_open_flags(), but openat2(2) checks all + * of its arguments. */ - flags &= VALID_OPEN_FLAGS; + if (flags & ~VALID_OPEN_FLAGS) + return -EINVAL; + if (how->resolve & ~VALID_RESOLVE_FLAGS) + return -EINVAL; - if (flags & (O_CREAT | __O_TMPFILE)) - op->mode = (mode & S_IALLUGO) | S_IFREG; - else + /* Deal with the mode. */ + if (WILL_CREATE(flags)) { + if (how->mode & ~S_IALLUGO) + return -EINVAL; + op->mode = how->mode | S_IFREG; + } else { + if (how->mode != 0) + return -EINVAL; op->mode = 0; - - /* Must never be set by userspace */ - flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + } /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. + * In order to ensure programs get explicit errors when trying to use + * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it + * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we + * have to require userspace to explicitly set it. */ - if (flags & __O_SYNC) - flags |= O_DSYNC; - if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; - } else if (flags & O_PATH) { - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + } + if (flags & O_PATH) { + /* O_PATH only permits certain other flags to be set. */ + if (flags & ~O_PATH_FLAGS) + return -EINVAL; acc_mode = 0; } + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. 
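One practical consequence of routing openat2(2) through this stricter build_open_flags() is that malformed arguments now fail loudly instead of being silently masked the way open(2) and openat(2) historically masked them. A short sketch of the behaviour this enforces, reusing the openat2_compat() wrapper, struct, and includes from the earlier example; both failures come from build_open_flags() itself, before any path walk, so the filename does not need to exist:

/* Assumes openat2_compat() and struct open_how_compat as defined in the
 * previous sketch. */
int main(void)
{
	struct open_how_compat how;

	/* A non-zero mode without O_CREAT or O_TMPFILE is rejected. */
	memset(&how, 0, sizeof(how));
	how.flags = O_RDONLY;
	how.mode = 0700;
	if (openat2_compat(AT_FDCWD, "does-not-matter", &how) < 0 &&
	    errno == EINVAL)
		puts("mode without O_CREAT: EINVAL");

	/* Unknown resolve bits are rejected rather than ignored. */
	how.mode = 0;
	how.resolve = 1ULL << 63;	/* not a valid RESOLVE_* bit */
	if (openat2_compat(AT_FDCWD, "does-not-matter", &how) < 0 &&
	    errno == EINVAL)
		puts("unknown resolve flag: EINVAL");
	return 0;
}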
+ */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ @@ -1022,6 +1057,18 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + + if (how->resolve & RESOLVE_NO_XDEV) + lookup_flags |= LOOKUP_NO_XDEV; + if (how->resolve & RESOLVE_NO_MAGICLINKS) + lookup_flags |= LOOKUP_NO_MAGICLINKS; + if (how->resolve & RESOLVE_NO_SYMLINKS) + lookup_flags |= LOOKUP_NO_SYMLINKS; + if (how->resolve & RESOLVE_BENEATH) + lookup_flags |= LOOKUP_BENEATH; + if (how->resolve & RESOLVE_IN_ROOT) + lookup_flags |= LOOKUP_IN_ROOT; + op->lookup_flags = lookup_flags; return 0; } @@ -1040,8 +1087,11 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); - return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); + if (err) + return ERR_PTR(err); + return do_filp_open(AT_FDCWD, name, &op); } /** @@ -1072,17 +1122,19 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +static long do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(flags, mode, &op); + int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) @@ -1092,7 +1144,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(flags); + fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { @@ -1107,12 +1159,16 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) return fd; } -SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { - if (force_o_largefile()) - flags |= O_LARGEFILE; + struct open_how how = build_open_how(flags, mode); + return do_sys_openat2(dfd, filename, &how); +} - return do_sys_open(AT_FDCWD, filename, flags, mode); + +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return ksys_open(filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, @@ -1120,10 +1176,32 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, { if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); } +SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, + struct open_how __user *, how, size_t, usize) +{ + int err; + struct open_how tmp; + + BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); + + if (unlikely(usize < OPEN_HOW_SIZE_VER0)) + return -EINVAL; + + err = 
copy_struct_from_user(&tmp, sizeof(tmp), how, usize); + if (err) + return err; + + /* O_LARGEFILE is only allowed for non-O_PATH. */ + if (!(tmp.flags & O_PATH) && force_o_largefile()) + tmp.flags |= O_LARGEFILE; + + return do_sys_openat2(dfd, filename, &tmp); +} + #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the diff --git a/fs/proc/base.c b/fs/proc/base.c index 915686772f0e..c7c64272b0fa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1718,8 +1718,7 @@ static const char *proc_pid_get_link(struct dentry *dentry, if (error) goto out; - nd_jump_link(&path); - return NULL; + error = nd_jump_link(&path); out: return ERR_PTR(error); } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8b5c720fe5d7..8e159fc78c0a 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -46,22 +46,26 @@ static const char *proc_ns_get_link(struct dentry *dentry, const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; - void *error = ERR_PTR(-EACCES); + int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) - return error; + return ERR_PTR(-EACCES); - if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { - error = ns_get_path(&ns_path, task, ns_ops); - if (!error) - nd_jump_link(&ns_path); - } + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto out; + + error = ns_get_path(&ns_path, task, ns_ops); + if (error) + goto out; + + error = nd_jump_link(&ns_path); +out: put_task_struct(task); - return error; + return ERR_PTR(error); } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 53429c29c784..58fc2a7c7fd1 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -22,8 +22,6 @@ MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota format v2 support"); MODULE_LICENSE("GPL"); -#define __QUOTA_V2_PARANOIA - static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); static int v2r0_is_id(void *dp, struct dquot *dquot); diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h index bd11e2c08119..31dca9a89176 100644 --- a/fs/quota/quotaio_v1.h +++ b/fs/quota/quotaio_v1.h @@ -25,8 +25,10 @@ struct v1_disk_dqblk { __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __u32 dqb_isoftlimit; /* preferred inode limit */ __u32 dqb_curinodes; /* current # allocated inodes */ - time_t dqb_btime; /* time limit for excessive disk use */ - time_t dqb_itime; /* time limit for excessive inode use */ + + /* below fields differ in length on 32-bit vs 64-bit architectures */ + unsigned long dqb_btime; /* time limit for excessive disk use */ + unsigned long dqb_itime; /* time limit for excessive inode use */ }; #define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4b3e3e73b512..072156c4f895 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -56,8 +56,6 @@ /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_list)) -#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ - j_working_list)) /* must be correct to keep the desc and commit structs at 4k */ #define JOURNAL_TRANS_HALF 1018 diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index f2cf3441fdfc..ff336513c254 100644 --- 
a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -63,7 +63,6 @@ static int show_version(struct seq_file *m, void *unused) #define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) #define DJF( x ) le32_to_cpu( rs -> x ) -#define DJV( x ) le32_to_cpu( s_v1 -> x ) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index da9ebe33882b..8bf88d690729 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -918,12 +918,6 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path) return memcmp(stored_ih, ih, IH_SIZE); } -/* unformatted nodes are not logged anymore, ever. This is safe now */ -#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) - -/* block can not be forgotten as it is in I/O or held by someone */ -#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) - /* prepare for delete or cut of direct item */ static inline int prepare_for_direct_item(struct treepath *path, struct item_head *le_ih, @@ -2246,7 +2240,8 @@ error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); #ifdef REISERQUOTA_DEBUG - reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + if (inode) + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3244037b1286..a6bce5b1fb1d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -629,6 +629,7 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); destroy_workqueue(REISERFS_SB(s)->commit_wq); + kfree(REISERFS_SB(s)->s_jdev); kfree(s->s_fs_info); s->s_fs_info = NULL; } @@ -1947,7 +1948,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (!sbi->s_jdev) { SWARN(silent, s, "", "Cannot allocate memory for " "journal device name"); - goto error; + goto error_unlocked; } } #ifdef CONFIG_QUOTA @@ -2240,6 +2241,7 @@ error_unlocked: kfree(qf_names[j]); } #endif + kfree(sbi->s_jdev); kfree(sbi); s->s_fs_info = NULL; diff --git a/fs/stat.c b/fs/stat.c index c38e4c2e1221..030008796479 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -21,6 +21,8 @@ #include <linux/uaccess.h> #include <asm/unistd.h> +#include "internal.h" + /** * generic_fillattr - Fill in the basic attributes from the inode struct * @inode: Inode to use as the source @@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat, } EXPORT_SYMBOL(vfs_statx_fd); +inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags) +{ + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + return -EINVAL; + + *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + *lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_NO_AUTOMOUNT) + *lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + *lookup_flags |= LOOKUP_EMPTY; + + return 0; +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags, { struct path path; int error = -EINVAL; - unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + unsigned lookup_flags; - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + if 
(vfs_stat_set_lookup_flags(&lookup_flags, flags)) return -EINVAL; - - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ -static noinline_for_stack int +noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index c8e8f50c6054..bc4dec5b1633 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -786,7 +786,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (page_offset > end_index) break; - page = find_or_create_page(mapping, page_offset, ra_gfp_mask); + page = pagecache_get_page(mapping, page_offset, + FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, + ra_gfp_mask); if (!page) break; if (!PageUptodate(page)) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 5dc5abca11c7..d49fc04f2d7d 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -17,10 +17,14 @@ #include "ubifs.h" /* Need to be kept consistent with checked flags in ioctl2ubifs() */ -#define UBIFS_SUPPORTED_IOCTL_FLAGS \ +#define UBIFS_SETTABLE_IOCTL_FLAGS \ (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \ FS_IMMUTABLE_FL | FS_DIRSYNC_FL) +/* Need to be kept consistent with checked flags in ubifs2ioctl() */ +#define UBIFS_GETTABLE_IOCTL_FLAGS \ + (UBIFS_SETTABLE_IOCTL_FLAGS | FS_ENCRYPT_FL) + /** * ubifs_set_inode_flags - set VFS inode flags. * @inode: VFS inode to set flags for @@ -91,6 +95,8 @@ static int ubifs2ioctl(int ubifs_flags) ioctl_flags |= FS_IMMUTABLE_FL; if (ubifs_flags & UBIFS_DIRSYNC_FL) ioctl_flags |= FS_DIRSYNC_FL; + if (ubifs_flags & UBIFS_CRYPT_FL) + ioctl_flags |= FS_ENCRYPT_FL; return ioctl_flags; } @@ -113,7 +119,8 @@ static int setflags(struct inode *inode, int flags) if (err) goto out_unlock; - ui->flags = ioctl2ubifs(flags); + ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); + ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); inode->i_ctime = current_time(inode); release = ui->dirty; @@ -155,8 +162,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS) + if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS) return -EOPNOTSUPP; + flags &= UBIFS_SETTABLE_IOCTL_FLAGS; if (!S_ISDIR(inode->i_mode)) flags &= ~FS_DIRSYNC_FL; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 54d6db61106f..edf43ddd7dce 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -129,7 +129,7 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o) static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) { if (orph->del) { - dbg_gen("deleted twice ino %lu", orph->inum); + dbg_gen("deleted twice ino %lu", (unsigned long)orph->inum); return; } @@ -137,7 +137,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) orph->del = 1; orph->dnext = c->orph_dnext; c->orph_dnext = orph; - dbg_gen("delete later ino %lu", orph->inum); + dbg_gen("delete later ino %lu", (unsigned long)orph->inum); return; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 2b7c04bf8983..17c90dff7266 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -161,7 +161,7 @@ static int 
create_default_filesystem(struct ubifs_info *c) sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL); mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); idx_node_size = ubifs_idx_node_sz(c, 1); - idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL); ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL); cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5e1e8ec0589e..7fc2f3f07c16 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1599,6 +1599,7 @@ out_free: vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); + kfree(c->sup_node); ubifs_debugging_exit(c); return err; } @@ -1641,6 +1642,7 @@ static void ubifs_umount(struct ubifs_info *c) vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); + kfree(c->sup_node); ubifs_debugging_exit(c); } diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index fb7f2c7bec9c..3fd85464abd5 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -4,7 +4,8 @@ * This file is based on ECMA-167 3rd edition (June 1997) * http://www.ecma.ch * - * Copyright (c) 2001-2002 Ben Fennema <bfennema@falcon.csc.calpoly.edu> + * Copyright (c) 2001-2002 Ben Fennema + * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,11 +33,19 @@ * SUCH DAMAGE. */ +/** + * @file + * ECMA-167r3 defines and structure definitions + */ + #include <linux/types.h> #ifndef _ECMA_167_H #define _ECMA_167_H 1 +/* Character sets and coding - d-characters (ECMA 167r3 1/7.2) */ +typedef uint8_t dchars; + /* Character set specification (ECMA 167r3 1/7.2.1) */ struct charspec { uint8_t charSetType; @@ -54,6 +63,7 @@ struct charspec { #define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */ #define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */ +/* Fixed-length character fields - d-string (EMCA 167r3 1/7.2.12) */ typedef uint8_t dstring; /* Timestamp (ECMA 167r3 1/7.3) */ @@ -85,22 +95,8 @@ struct regid { } __packed; /* Flags (ECMA 167r3 1/7.4.1) */ -#define ENTITYID_FLAGS_DIRTY 0x00 -#define ENTITYID_FLAGS_PROTECTED 0x01 - -/* OSTA UDF 2.1.5.2 */ -#define UDF_ID_COMPLIANT "*OSTA UDF Compliant" - -/* OSTA UDF 2.1.5.3 */ -struct domainEntityIDSuffix { - uint16_t revision; - uint8_t flags; - uint8_t reserved[5]; -}; - -/* OSTA UDF 2.1.5.3 */ -#define ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT 0 -#define ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT 1 +#define ENTITYID_FLAGS_DIRTY 0x01 +#define ENTITYID_FLAGS_PROTECTED 0x02 /* Volume Structure Descriptor (ECMA 167r3 2/9.1) */ #define VSD_STD_ID_LEN 5 @@ -202,6 +198,13 @@ struct NSRDesc { uint8_t structData[2040]; } __packed; +/* Generic Descriptor */ +struct genericDesc { + struct tag descTag; + __le32 volDescSeqNum; + uint8_t reserved[492]; +} __packed; + /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ struct primaryVolDesc { struct tag descTag; @@ -316,7 +319,7 @@ struct genericPartitionMap { /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ #define GP_PARTITION_MAP_TYPE_UNDEF 0x00 -#define GP_PARTIITON_MAP_TYPE_1 0x01 +#define GP_PARTITION_MAP_TYPE_1 0x01 #define GP_PARTITION_MAP_TYPE_2 0x02 /* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */ @@ -723,6 +726,7 @@ struct appUseExtAttr { #define EXTATTR_DEV_SPEC 12 #define EXTATTR_IMP_USE 2048 #define EXTATTR_APP_USE 65536 +#define EXTATTR_SUBTYPE 1 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */ struct unallocSpaceEntry { @@ -754,10 +758,12 @@ 
struct partitionIntegrityEntry { /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ /* Extent Length (ECMA 167r3 4/14.14.1.1) */ +#define EXT_LENGTH_MASK 0x3FFFFFFF +#define EXT_TYPE_MASK 0xC0000000 #define EXT_RECORDED_ALLOCATED 0x00000000 #define EXT_NOT_RECORDED_ALLOCATED 0x40000000 #define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000 -#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000 +#define EXT_NEXT_EXTENT_ALLOCDESCS 0xC0000000 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ @@ -774,7 +780,7 @@ struct pathComponent { uint8_t componentType; uint8_t lengthComponentIdent; __le16 componentFileVersionNum; - dstring componentIdent[0]; + dchars componentIdent[0]; } __packed; /* File Entry (ECMA 167r3 4/14.17) */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ea80036d7897..e875bc5668ee 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1981,10 +1981,10 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, - sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, - sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } brelse(epos->bh); @@ -2143,7 +2143,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == - (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_pblk_t block; if (++indirections > UDF_MAX_INDIR_EXTS) { diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index a4da59e38b7f..35e61b2cacfe 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -1,10 +1,11 @@ /* * osta_udf.h * - * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003) + * This file is based on OSTA UDF(tm) 2.60 (March 1, 2005) * http://www.osta.org * - * Copyright (c) 2001-2004 Ben Fennema <bfennema@falcon.csc.calpoly.edu> + * Copyright (c) 2001-2004 Ben Fennema + * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,38 +33,57 @@ * SUCH DAMAGE. 
*/ +/** + * @file + * OSTA-UDF defines and structure definitions + */ + #include "ecma_167.h" #ifndef _OSTA_UDF_H #define _OSTA_UDF_H 1 -/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */ +/* OSTA CS0 Charspec (UDF 2.60 2.1.2) */ #define UDF_CHAR_SET_TYPE 0 #define UDF_CHAR_SET_INFO "OSTA Compressed Unicode" -/* Entity Identifier (UDF 2.50 2.1.5) */ -/* Identifiers (UDF 2.50 2.1.5.2) */ +/* Entity Identifier (UDF 2.60 2.1.5) */ +/* Identifiers (UDF 2.60 2.1.5.2) */ +/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */ +/* Virtual Allocation Table (UDF 1.50 2.2.10) */ +/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */ +/* OS2EA (UDF 1.50 3.3.4.5.3.1) */ +/* MacUniqueIDTable (UDF 1.50 3.3.4.5.4.3) */ +/* MacResourceFork (UDF 1.50 3.3.4.5.4.4) */ #define UDF_ID_DEVELOPER "*Linux UDFFS" #define UDF_ID_COMPLIANT "*OSTA UDF Compliant" #define UDF_ID_LV_INFO "*UDF LV Info" #define UDF_ID_FREE_EA "*UDF FreeEASpace" #define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace" #define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info" +#define UDF_ID_VAT_LVEXTENSION "*UDF VAT LVExtension" #define UDF_ID_OS2_EA "*UDF OS/2 EA" #define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength" #define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo" #define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo" #define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable" #define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork" +#define UDF_ID_OS400_DIRINFO "*UDF OS/400 DirInfo" #define UDF_ID_VIRTUAL "*UDF Virtual Partition" #define UDF_ID_SPARABLE "*UDF Sparable Partition" #define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl" #define UDF_ID_SPARING "*UDF Sparing Table" #define UDF_ID_METADATA "*UDF Metadata Partition" -/* Identifier Suffix (UDF 2.50 2.1.5.3) */ -#define IS_DF_HARD_WRITE_PROTECT 0x01 -#define IS_DF_SOFT_WRITE_PROTECT 0x02 +/* Identifier Suffix (UDF 2.60 2.1.5.3) */ +#define DOMAIN_FLAGS_HARD_WRITE_PROTECT 0x01 +#define DOMAIN_FLAGS_SOFT_WRITE_PROTECT 0x02 + +struct domainIdentSuffix { + __le16 UDFRevision; + uint8_t domainFlags; + uint8_t reserved[5]; +} __packed; struct UDFIdentSuffix { __le16 UDFRevision; @@ -75,15 +95,15 @@ struct UDFIdentSuffix { struct impIdentSuffix { uint8_t OSClass; uint8_t OSIdentifier; - uint8_t reserved[6]; + uint8_t impUse[6]; } __packed; struct appIdentSuffix { uint8_t impUse[8]; } __packed; -/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ -/* Implementation Use (UDF 2.50 2.2.6.4) */ +/* Logical Volume Integrity Descriptor (UDF 2.60 2.2.6) */ +/* Implementation Use (UDF 2.60 2.2.6.4) */ struct logicalVolIntegrityDescImpUse { struct regid impIdent; __le32 numFiles; @@ -94,8 +114,8 @@ struct logicalVolIntegrityDescImpUse { uint8_t impUse[0]; } __packed; -/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ -/* Implementation Use (UDF 2.50 2.2.7.2) */ +/* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */ +/* Implementation Use (UDF 2.60 2.2.7.2) */ struct impUseVolDescImpUse { struct charspec LVICharset; dstring logicalVolIdent[128]; @@ -115,7 +135,7 @@ struct udfPartitionMap2 { __le16 partitionNum; } __packed; -/* Virtual Partition Map (UDF 2.50 2.2.8) */ +/* Virtual Partition Map (UDF 2.60 2.2.8) */ struct virtualPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -126,7 +146,7 @@ struct virtualPartitionMap { uint8_t reserved2[24]; } __packed; -/* Sparable Partition Map (UDF 2.50 2.2.9) */ +/* Sparable Partition Map (UDF 2.60 2.2.9) */ struct sparablePartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -141,7 +161,7 @@ struct sparablePartitionMap { 
__le32 locSparingTable[4]; } __packed; -/* Metadata Partition Map (UDF 2.4.0 2.2.10) */ +/* Metadata Partition Map (UDF 2.60 2.2.10) */ struct metadataPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -160,14 +180,14 @@ struct metadataPartitionMap { /* Virtual Allocation Table (UDF 1.5 2.2.10) */ struct virtualAllocationTable15 { - __le32 VirtualSector[0]; + __le32 vatEntry[0]; struct regid vatIdent; __le32 previousVATICBLoc; } __packed; #define ICBTAG_FILE_TYPE_VAT15 0x00U -/* Virtual Allocation Table (UDF 2.50 2.2.11) */ +/* Virtual Allocation Table (UDF 2.60 2.2.11) */ struct virtualAllocationTable20 { __le16 lengthHeader; __le16 lengthImpUse; @@ -175,9 +195,9 @@ struct virtualAllocationTable20 { __le32 previousVATICBLoc; __le32 numFiles; __le32 numDirs; - __le16 minReadRevision; - __le16 minWriteRevision; - __le16 maxWriteRevision; + __le16 minUDFReadRev; + __le16 minUDFWriteRev; + __le16 maxUDFWriteRev; __le16 reserved; uint8_t impUse[0]; __le32 vatEntry[0]; @@ -185,7 +205,7 @@ struct virtualAllocationTable20 { #define ICBTAG_FILE_TYPE_VAT20 0xF8U -/* Sparing Table (UDF 2.50 2.2.12) */ +/* Sparing Table (UDF 2.60 2.2.12) */ struct sparingEntry { __le32 origLocation; __le32 mappedLocation; @@ -201,12 +221,12 @@ struct sparingTable { mapEntry[0]; } __packed; -/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ +/* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */ #define ICBTAG_FILE_TYPE_MAIN 0xFA #define ICBTAG_FILE_TYPE_MIRROR 0xFB #define ICBTAG_FILE_TYPE_BITMAP 0xFC -/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +/* struct struct long_ad ICB - ADImpUse (UDF 2.60 2.2.4.3) */ struct allocDescImpUse { __le16 flags; uint8_t impUse[4]; @@ -214,17 +234,17 @@ struct allocDescImpUse { #define AD_IU_EXT_ERASED 0x0001 -/* Real-Time Files (UDF 2.50 6.11) */ +/* Real-Time Files (UDF 2.60 6.11) */ #define ICBTAG_FILE_TYPE_REALTIME 0xF9U -/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */ -/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */ +/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */ +/* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */ struct freeEaSpace { __le16 headerChecksum; uint8_t freeEASpace[0]; } __packed; -/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ +/* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */ struct DVDCopyrightImpUse { __le16 headerChecksum; uint8_t CGMSInfo; @@ -232,20 +252,35 @@ struct DVDCopyrightImpUse { uint8_t protectionSystemInfo[4]; } __packed; -/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ -/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ +/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */ +struct LVExtensionEA { + __le16 headerChecksum; + __le64 verificationID; + __le32 numFiles; + __le32 numDirs; + dstring logicalVolIdent[128]; +} __packed; + +/* Application Use Extended Attribute (UDF 2.60 3.3.4.6) */ +/* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */ struct freeAppEASpace { __le16 headerChecksum; uint8_t freeEASpace[0]; } __packed; -/* UDF Defined System Stream (UDF 2.50 3.3.7) */ +/* UDF Defined System Stream (UDF 2.60 3.3.7) */ #define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" #define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space" #define UDF_ID_POWER_CAL "*UDF Power Cal Table" #define UDF_ID_BACKUP "*UDF Backup" -/* Operating System Identifiers (UDF 2.50 6.3) */ +/* UDF Defined Non-System Streams (UDF 2.60 3.3.8) */ +#define UDF_ID_MAC_RESOURCE_FORK_STREAM "*UDF Macintosh Resource Fork" +/* #define UDF_ID_OS2_EA 
"*UDF OS/2 EA" */ +#define UDF_ID_NT_ACL "*UDF NT ACL" +#define UDF_ID_UNIX_ACL "*UDF UNIX ACL" + +/* Operating System Identifiers (UDF 2.60 6.3) */ #define UDF_OS_CLASS_UNDEF 0x00U #define UDF_OS_CLASS_DOS 0x01U #define UDF_OS_CLASS_OS2 0x02U @@ -270,6 +305,7 @@ struct freeAppEASpace { #define UDF_OS_ID_LINUX 0x05U #define UDF_OS_ID_MKLINUX 0x06U #define UDF_OS_ID_FREEBSD 0x07U +#define UDF_OS_ID_NETBSD 0x08U #define UDF_OS_ID_WIN9X 0x00U #define UDF_OS_ID_WINNT 0x00U #define UDF_OS_ID_OS400 0x00U diff --git a/fs/udf/super.c b/fs/udf/super.c index 8c28e93e9b73..f747bf72edbe 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -767,20 +767,20 @@ static int udf_check_vsd(struct super_block *sb) static int udf_verify_domain_identifier(struct super_block *sb, struct regid *ident, char *dname) { - struct domainEntityIDSuffix *suffix; + struct domainIdentSuffix *suffix; if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) { udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } - if (ident->flags & (1 << ENTITYID_FLAGS_DIRTY)) { + if (ident->flags & ENTITYID_FLAGS_DIRTY) { udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } - suffix = (struct domainEntityIDSuffix *)ident->identSuffix; - if (suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT) || - suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT)) { + suffix = (struct domainIdentSuffix *)ident->identSuffix; + if ((suffix->domainFlags & DOMAIN_FLAGS_HARD_WRITE_PROTECT) || + (suffix->domainFlags & DOMAIN_FLAGS_SOFT_WRITE_PROTECT)) { if (!sb_rdonly(sb)) { udf_warn(sb, "Descriptor for %s marked write protected." " Forcing read only mount.\n", dname); @@ -1035,7 +1035,6 @@ static int check_partition_desc(struct super_block *sb, switch (le32_to_cpu(p->accessType)) { case PD_ACCESS_TYPE_READ_ONLY: case PD_ACCESS_TYPE_WRITE_ONCE: - case PD_ACCESS_TYPE_REWRITABLE: case PD_ACCESS_TYPE_NONE: goto force_ro; } @@ -1063,7 +1062,8 @@ static int check_partition_desc(struct super_block *sb, goto force_ro; if (map->s_partition_type == UDF_VIRTUAL_MAP15 || - map->s_partition_type == UDF_VIRTUAL_MAP20) + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25) goto force_ro; return 0; @@ -2402,6 +2402,10 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; buf->f_bfree = udf_count_free(sb); buf->f_bavail = buf->f_bfree; + /* + * Let's pretend each free block is also a free 'inode' since UDF does + * not have separate preallocated table of inodes. + */ buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) + le32_to_cpu(lvidiu->numDirs)) : 0) + buf->f_bfree; @@ -2492,17 +2496,29 @@ static unsigned int udf_count_free_table(struct super_block *sb, static unsigned int udf_count_free(struct super_block *sb) { unsigned int accum = 0; - struct udf_sb_info *sbi; + struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; + unsigned int part = sbi->s_partition; + int ptype = sbi->s_partmaps[part].s_partition_type; + + if (ptype == UDF_METADATA_MAP25) { + part = sbi->s_partmaps[part].s_type_specific.s_metadata. + s_phys_partition_ref; + } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) { + /* + * Filesystems with VAT are append-only and we cannot write to + * them. Let's just report 0 here. 
+ */ + return 0; + } - sbi = UDF_SB(sb); if (sbi->s_lvid_bh) { struct logicalVolIntegrityDesc *lvid = (struct logicalVolIntegrityDesc *) sbi->s_lvid_bh->b_data; - if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) { + if (le32_to_cpu(lvid->numOfPartitions) > part) { accum = le32_to_cpu( - lvid->freeSpaceTable[sbi->s_partition]); + lvid->freeSpaceTable[part]); if (accum == 0xFFFFFFFF) accum = 0; } @@ -2511,7 +2527,7 @@ static unsigned int udf_count_free(struct super_block *sb) if (accum) return accum; - map = &sbi->s_partmaps[sbi->s_partition]; + map = &sbi->s_partmaps[part]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { accum += udf_count_free_bitmap(sb, map->s_uspace.s_bitmap); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 63a47f1e1d52..532cda99644e 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -241,7 +241,7 @@ int udf_truncate_extents(struct inode *inode) while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { - if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0d7fcc983b3d..e6149720ce02 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -62,6 +62,7 @@ xfs_attr_args_init( struct xfs_da_args *args, struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { @@ -74,7 +75,7 @@ xfs_attr_args_init( args->dp = dp; args->flags = flags; args->name = name; - args->namelen = strlen((const char *)name); + args->namelen = namelen; if (args->namelen >= MAXNAMELEN) return -EFAULT; /* match IRIX behaviour */ @@ -139,6 +140,7 @@ int xfs_attr_get( struct xfs_inode *ip, const unsigned char *name, + size_t namelen, unsigned char **value, int *valuelenp, int flags) @@ -154,7 +156,7 @@ xfs_attr_get( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, flags); + error = xfs_attr_args_init(&args, ip, name, namelen, flags); if (error) return error; @@ -338,6 +340,7 @@ int xfs_attr_set( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, unsigned char *value, int valuelen, int flags) @@ -353,7 +356,7 @@ xfs_attr_set( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -442,6 +445,7 @@ int xfs_attr_remove( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { struct xfs_mount *mp = dp->i_mount; @@ -453,7 +457,7 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -1007,7 +1011,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. 
*/ - args->flags |= XFS_ATTR_INCOMPLETE; + args->op_flags |= XFS_DA_OP_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 94badfa1743e..4243b2272642 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -26,7 +26,7 @@ struct xfs_attr_list_context; *========================================================================*/ -#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ #define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ @@ -37,7 +37,10 @@ struct xfs_attr_list_context; #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ +#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ + +#define ATTR_KERNEL_FLAGS \ + (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ @@ -145,11 +148,13 @@ int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char **value, int *valuelenp, int flags); + size_t namelen, unsigned char **value, int *valuelenp, + int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); + size_t namelen, unsigned char *value, int valuelen, int flags); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 08d4b10ae2d5..fed537a4353d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2403,8 +2403,8 @@ xfs_attr3_leaf_lookup_int( * If we are looking for INCOMPLETE entries, show only those. * If we are looking for complete entries, show only those. */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { continue; } if (entry->flags & XFS_ATTR_LOCAL) { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f4a188e28b7b..73615b1dd1a8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -39,15 +39,6 @@ struct xfs_attr3_icleaf_hdr { } freemap[XFS_ATTR_LEAF_MAPSIZE]; }; -/* - * Used to keep a list of "remote value" extents when unlinking an inode. - */ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - - /*======================================================================== * Function prototypes for the kernel. 
*========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index a6ef5df42669..a266d05df146 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -26,6 +26,23 @@ #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ /* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). + * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + +/* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ @@ -401,17 +418,25 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); - if (error) + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, + &xfs_attr3_rmt_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; + } error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error; @@ -552,6 +577,33 @@ xfs_attr_rmtval_set( return 0; } +/* Mark stale any incore buffers for the remote value. */ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + return -EFSCORRUPTED; + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. 
@@ -560,7 +612,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -575,9 +626,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap; /* @@ -588,22 +636,11 @@ xfs_attr_rmtval_remove( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error; lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e2cc98931552..b22c7e928eb1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2389,8 +2389,6 @@ xfs_btree_lshift( XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ - int i; /* loop index */ - for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index e16610d1c14f..0f4fbb0889ff 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -89,6 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -96,7 +97,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 3dee33043e09..734837a9b51a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -217,7 +217,7 @@ typedef struct xfs_dir2_sf_entry { * A 64-bit or 32-bit inode number follows here, at a variable offset * after the name. 
*/ -} xfs_dir2_sf_entry_t; +} __packed xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { @@ -683,8 +683,6 @@ struct xfs_attr3_leafblock { /* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. */ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1b7dcbae051c..77e9fa385980 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. + */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8ef31d71a9c7..9bac0d2e56dc 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -462,11 +462,20 @@ static inline uint xfs_log_dinode_size(int version) #define XFS_BLF_GDQUOT_BUF (1<<4) /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + * This is the structure used to lay out a buf log item in the log. The data + * map describes which 128 byte chunks of the buffer have been logged. + * + * The placement of blf_map_size causes blf_data_map to start at an odd + * multiple of sizeof(unsigned int) offset within the struct. Because the data + * bitmap size will always be an even number, the end of the data_map (and + * therefore the structure) will also be at an odd multiple of sizeof(unsigned + * int). Some 64-bit compilers will insert padding at the end of the struct to + * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore, + * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and + * keep the structure size consistent between 32-bit and 64-bit platforms. + */ +#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) +#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60c61d7052a8..c3422403b169 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -75,7 +75,6 @@ static inline xfs_extlen_t xrep_calc_ag_resblks( struct xfs_scrub *sc) { - ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); return 0; } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 91693fce34a8..cd743fad8478 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -145,7 +145,8 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. 
*/ len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, + error = xfs_attr_get(ip, ea_name, strlen(ea_name), + (unsigned char **)&xfs_acl, &len, ATTR_ALLOC | ATTR_ROOT); if (error) { /* @@ -196,15 +197,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) len -= sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, - len, ATTR_ROOT); + error = xfs_attr_set(ip, ea_name, strlen(ea_name), + (unsigned char *)xfs_acl, len, ATTR_ROOT); kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. */ - error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); + error = xfs_attr_remove(ip, ea_name, + strlen(ea_name), + ATTR_ROOT); /* * If the attribute didn't exist to start with that's fine. diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5ff49523d8ea..8fbb841cd6fe 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -25,22 +25,18 @@ #include "xfs_error.h" /* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; int nmap; int error; @@ -48,47 +44,29 @@ xfs_attr3_leaf_freextent( * Roll through the "value", invalidating the attribute value's * blocks. */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. 
- */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error; - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } return 0; @@ -102,86 +80,45 @@ xfs_attr3_leaf_freextent( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error = 0; + int i; - leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } + int blkcnt; - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, 0); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue; - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue; - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; } - kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3984779e5911..5be8973a452c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -27,6 +27,23 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +/* Is this log iovec plausibly large enough to contain the buffer log format? 
*/ +bool +xfs_buf_log_check_iovec( + struct xfs_log_iovec *iovec) +{ + struct xfs_buf_log_format *blfp = iovec->i_addr; + char *bmp_end; + char *item_end; + + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + return false; + + item_end = (char *)iovec->i_addr + iovec->i_len; + bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; + return bmp_end <= item_end; +} + static inline int xfs_buf_log_format_size( struct xfs_buf_log_format *blfp) @@ -688,7 +705,7 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC int +STATIC void xfs_buf_item_get_format( struct xfs_buf_log_item *bip, int count) @@ -698,14 +715,11 @@ xfs_buf_item_get_format( if (count == 1) { bip->bli_formats = &bip->__bli_format; - return 0; + return; } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 0); - if (!bip->bli_formats) - return -ENOMEM; - return 0; } STATIC void @@ -731,7 +745,6 @@ xfs_buf_item_init( struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; - int error; int i; /* @@ -760,19 +773,22 @@ xfs_buf_item_init( * Discontiguous buffer support follows the layout of the underlying * buffer. This makes the implementation as simple as possible. */ - error = xfs_buf_item_get_format(bip, bp->b_map_count); - ASSERT(error == 0); - if (error) { /* to stop gcc throwing set-but-unused warnings */ - kmem_cache_free(xfs_buf_item_zone, bip); - return error; - } - + xfs_buf_item_get_format(bip, bp->b_map_count); for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), XFS_BLF_CHUNK); map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) { + kmem_cache_free(xfs_buf_item_zone, bip); + xfs_err(mp, + "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", + map_size, + BBTOB(bp->b_maps[i].bm_len)); + return -EFSCORRUPTED; + } + bip->bli_formats[i].blf_type = XFS_LI_BUF; bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; @@ -805,6 +821,9 @@ xfs_buf_item_log_segment( uint end_bit; uint mask; + ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + /* * Convert byte offsets to bit numbers. 
*/ diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4a054b11011a..30114b510332 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,6 +61,7 @@ void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, struct list_head *); +bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2bff21ca9d78..9cfd3209f52b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers( (d->d_blk_hardlimit && (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(get_seconds() + + d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; @@ -160,7 +160,7 @@ xfs_qm_adjust_dqtimers( (d->d_ino_hardlimit && (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(get_seconds() + + d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; @@ -183,7 +183,7 @@ xfs_qm_adjust_dqtimers( (d->d_rtb_hardlimit && (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(get_seconds() + + d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c93250108952..b8a4a3f29b36 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -187,7 +187,12 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 401da197f012..1979a0055763 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1518,10 +1518,8 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1541,21 +1539,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. 
*/ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; @@ -1575,7 +1574,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7b35d62ede9f..d42de92cb283 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -357,6 +357,7 @@ xfs_attrmulti_attr_get( { unsigned char *kbuf; int error = -EFAULT; + size_t namelen; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; @@ -364,7 +365,9 @@ xfs_attrmulti_attr_get( if (!kbuf) return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); + namelen = strlen(name); + error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, + flags); if (error) goto out_kfree; @@ -386,6 +389,7 @@ xfs_attrmulti_attr_set( { unsigned char *kbuf; int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -396,7 +400,8 @@ xfs_attrmulti_attr_set( if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + namelen = strlen(name); + error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); if (!error) xfs_forget_acl(inode, name, flags); kfree(kbuf); @@ -410,10 +415,12 @@ xfs_attrmulti_attr_remove( uint32_t flags) { int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - error = xfs_attr_remove(XFS_I(inode), name, flags); + namelen = strlen(name); + error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); if (!error) xfs_forget_acl(inode, name, flags); return error; @@ -462,6 +469,13 @@ xfs_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4c4f09113d3..769581a79c58 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -107,7 +107,7 @@ xfs_ioctl32_bstime_copyin( xfs_bstime_t *bstime, compat_xfs_bstime_t __user *bstime32) { - compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + old_time32_t sec32; /* tv_sec differs on 64 vs. 
32 */ if (get_user(sec32, &bstime32->tv_sec) || get_user(bstime->tv_nsec, &bstime32->tv_nsec)) @@ -450,6 +450,13 @@ xfs_compat_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, compat_ptr(ops[i].am_attrname), MAXNAMELEN); diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 8c7743cd490e..053de7d894cd 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -32,7 +32,7 @@ #endif typedef struct compat_xfs_bstime { - compat_time_t tv_sec; /* seconds */ + old_time32_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } compat_xfs_bstime_t; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 28e2d1f37267..bb590a267a7f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -923,7 +923,7 @@ xfs_buffered_write_iomap_begin( xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_inode_need_cow(ip, &imap, &shared); + error = xfs_bmap_trim_cow(ip, &imap, &shared); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8afe69ca188b..81f2f93caec0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -50,8 +50,10 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = xfs_attr_set(ip, xattr->name, + strlen(xattr->name), + xattr->value, xattr->value_len, + ATTR_SECURE); if (error < 0) break; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 99ec3fba4548..0d683fb96396 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1934,6 +1934,12 @@ xlog_recover_buffer_pass1( struct list_head *bucket; struct xfs_buf_cancel *bcp; + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + /* * If this isn't a cancel buffer item, then just return. 
*/ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index b6701b4f59a9..5f04d8a5ab2a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -111,6 +111,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 7823af39008b..4e57edca8bce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -64,9 +64,9 @@ struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct list_lru qi_lru; int qi_dquots; - time_t qi_btimelimit; /* limit for blks timer */ - time_t qi_itimelimit; /* limit for inodes timer */ - time_t qi_rtbtimelimit;/* limit for rt blks timer */ + time64_t qi_btimelimit; /* limit for blks timer */ + time64_t qi_itimelimit; /* limit for inodes timer */ + time64_t qi_rtbtimelimit;/* limit for rt blks timer */ xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index c7de17deeae6..38669e827206 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,9 +37,9 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = q->qi_btimelimit; - tstate->ino_timelimit = q->qi_itimelimit; - tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_timelimit = (u32)q->qi_btimelimit; + tstate->ino_timelimit = (u32)q->qi_itimelimit; + tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; tstate->spc_warnlimit = q->qi_bwarnlimit; tstate->ino_warnlimit = q->qi_iwarnlimit; tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index de451235c4ee..e723b267a247 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -223,8 +223,8 @@ xfs_reflink_trim_around_shared( } } -bool -xfs_inode_need_cow( +int +xfs_bmap_trim_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared) @@ -327,7 +327,7 @@ xfs_find_trim_cow_extent( if (cmap->br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, cmap->br_startoff - imap->br_startoff); - return xfs_inode_need_cow(ip, imap, shared); + return xfs_bmap_trim_cow(ip, imap, shared); } *shared = true; @@ -1457,7 +1457,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. 
*/ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index d18ad7f4fb64..3e4fd46373ab 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -22,7 +22,7 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); -bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, +int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d9ae27ddf253..760901783944 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -193,32 +193,6 @@ xfs_fs_show_options( return 0; } -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - - return (((uint64_t)pagefactor) << bitshift) - 1; -} - /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -1424,6 +1398,26 @@ xfs_fc_fill_super( if (error) goto out_free_sb; + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. 
+ */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1435,7 +1429,7 @@ xfs_fc_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; sb->s_time_min = S32_MIN; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index a6fe2d8dc40f..d1b9869bc5fa 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -580,7 +580,7 @@ xfs_trans_dqresv( { xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; - time_t timer; + time64_t timer; xfs_qwarncnt_t warns; xfs_qwarncnt_t warnlimit; xfs_qcnt_t total_count; @@ -635,7 +635,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTLONGWARN); @@ -662,7 +663,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTLONGWARN); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 383f0203d103..b0fedb543f97 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -24,6 +24,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error, asize = size; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { @@ -31,7 +32,8 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, value = NULL; } - error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); + error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, + &asize, xflags); if (error) return error; return asize; @@ -67,6 +69,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) @@ -74,10 +77,11 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, if (flags & XATTR_REPLACE) xflags |= ATTR_REPLACE; - if (!value) - return xfs_attr_remove(ip, (unsigned char *)name, xflags); - error = xfs_attr_set(ip, (unsigned char *)name, - (void *)value, size, xflags); + if (value) + error = xfs_attr_set(ip, name, namelen, (void *)value, size, + xflags); + else + error = xfs_attr_remove(ip, name, namelen, xflags); if (!error) xfs_forget_acl(inode, name, xflags); |
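The udf_statfs and udf_count_free hunks above lean on two pieces of reasoning: UDF keeps no preallocated inode table, so every free block is also counted as a free "inode", and VAT-based (write-once) media are append-only, so their free space is reported as zero. A minimal standalone C sketch of that accounting, using made-up example values (num_files, num_dirs, bfree and is_vat are illustrative variables, not kernel symbols):

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch (not kernel code) of the statfs accounting described in
 * the udf_statfs/udf_count_free comments above: every free block doubles as
 * a free "inode", and VAT (append-only) partitions report zero free blocks.
 */
int main(void)
{
	uint64_t num_files = 1000;	/* from the LVID, if present */
	uint64_t num_dirs = 50;
	uint64_t bfree = 250000;	/* free blocks on the partition */
	int is_vat = 0;			/* VAT15/VAT20 partition map? */

	if (is_vat)
		bfree = 0;		/* append-only media: nothing writable */

	printf("f_bfree = %llu\n", (unsigned long long)bfree);
	printf("f_files = %llu\n",
	       (unsigned long long)(num_files + num_dirs + bfree));
	return 0;
}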
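The new comment block in fs/xfs/libxfs/xfs_attr_remote.c explains why remote attribute value buffers must never touch the log: a maximally sized 64k value plus per-block headers needs one block more than 64k of space, which exceeds the largest dirty buffer the log format can handle. A standalone sketch of that arithmetic, assuming the 64-byte header size quoted in the comment (rmt_blocks is an illustrative helper, not the kernel's xfs_attr3_rmt_blocks):

#include <stdio.h>

/*
 * Standalone sketch (not kernel code): rough arithmetic behind the
 * "68k buffer on 4k blocks / 128k buffer on 64k blocks" statement above.
 * The 64-byte per-block header size is taken from the comment and is an
 * assumption here.
 */
static unsigned int rmt_blocks(unsigned int blocksize, unsigned int valuelen)
{
	unsigned int usable = blocksize - 64;		/* payload bytes per block */

	return (valuelen + usable - 1) / usable;	/* round up */
}

int main(void)
{
	/* 4k blocks: 17 blocks -> 68k buffer; 64k blocks: 2 blocks -> 128k. */
	printf("4k blocks:  %u blocks\n", rmt_blocks(4096, 65536));
	printf("64k blocks: %u blocks\n", rmt_blocks(65536, 65536));
	return 0;
}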
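The XFS_BLF_DATAMAP_SIZE change in fs/xfs/libxfs/xfs_log_format.h pads the dirty bitmap to an odd number of words so struct xfs_buf_log_format ends up the same size on 32-bit and 64-bit builds. A standalone mirror of the layout, assuming the usual XFS_MAX_BLOCKSIZE = 64k, XFS_BLF_CHUNK = 128 and 32-bit word values; it should print 88, matching the XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88) check added in xfs_ondisk.h:

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch (not kernel code) mirroring the xfs_buf_log_format layout
 * to illustrate the XFS_BLF_DATAMAP_SIZE comment above.  With an even bitmap
 * of 16 words the struct is 84 bytes and 64-bit compilers pad it to 88 while
 * 32-bit ones leave it at 84; with 17 words it is 88 bytes on both.
 */
struct blf_mirror {
	uint16_t blf_type;
	uint16_t blf_size;
	uint16_t blf_flags;
	uint16_t blf_len;
	int64_t  blf_blkno;
	uint32_t blf_map_size;
	uint32_t blf_data_map[17];	/* (65536 / 128) / 32 + 1 */
};

int main(void)
{
	/* 8 + 8 + 4 + 68 = 88 bytes, no tail padding needed. */
	printf("sizeof = %zu\n", sizeof(struct blf_mirror));
	return 0;
}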
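The XFS_MAX_FILEOFF macro and the s_maxbytes sanity check in xfs_fc_fill_super rest on simple bit arithmetic: bmbt start offsets are 54 bits wide and block counts 21 bits, while MAX_LFS_FILESIZE is 2^63 - 1 bytes on 64-bit kernels. A standalone sketch of that comparison, assuming 4k filesystem blocks (blocklog = 12 is an illustrative value):

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch (not kernel code) of the XFS_MAX_FILEOFF arithmetic used
 * by the truncate and fill_super changes above: the largest bmbt file offset
 * is BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK, and the VFS limit converted
 * to filesystem blocks must not exceed it.
 */
int main(void)
{
	uint64_t startoff_mask   = (1ULL << 54) - 1;
	uint64_t blockcount_mask = (1ULL << 21) - 1;
	uint64_t max_fileoff     = startoff_mask + blockcount_mask;
	uint64_t max_lfs_bytes   = ~0ULL >> 1;	/* 2^63 - 1 on 64-bit */
	unsigned int blocklog    = 12;		/* assume 4k blocks */

	printf("XFS_MAX_FILEOFF      = %llu blocks\n",
	       (unsigned long long)max_fileoff);
	printf("MAX_LFS_FILESIZE fsb = %llu blocks\n",
	       (unsigned long long)(max_lfs_bytes >> blocklog));
	/* ~2^51 blocks of pagecache limit fits well under the ~2^54 bmbt max. */
	return 0;
}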