Diffstat (limited to 'fs')
96 files changed, 13233 insertions, 1410 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index bfb1c6095c7a..2501e6f1f965 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -112,6 +112,8 @@ config MANDATORY_FILE_LOCKING source "fs/crypto/Kconfig" +source "fs/verity/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" @@ -261,6 +263,7 @@ source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" +source "fs/erofs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index d60089fd689b..14231b4cf383 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_IO_URING) += io_uring.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ +obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o @@ -130,3 +131,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +obj-$(CONFIG_EROFS_FS) += erofs/ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ff438fd5bc2..eeb75281894e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3628,6 +3628,13 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + /* * Lock eb pages and flush the bio if we can't the locks * @@ -3699,8 +3706,11 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) { + int err; + + err = flush_write_bio(epd); + if (err < 0) { + ret = err; failed_page_nr = i; goto err_unlock; } @@ -3715,16 +3725,23 @@ err_unlock: /* Unlock already locked pages */ for (i = 0; i < failed_page_nr; i++) unlock_page(eb->pages[i]); + /* + * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. + * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can + * be made and undo everything done before. 
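The clear_bit()/smp_mb__after_atomic()/wake_up_bit() sequence in the new end_extent_buffer_writeback() is the waker half of the standard bit-wait pattern; the waiter half is the wait_on_extent_buffer_writeback() visible in the hunk context above. A minimal sketch of how the two sides pair up (eb->bflags and EXTENT_BUFFER_WRITEBACK are as declared by btrfs):

/* Waiter: sleeps until EXTENT_BUFFER_WRITEBACK is cleared. */
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
        wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
                       TASK_UNINTERRUPTIBLE);
}

/*
 * Waker: smp_mb__after_atomic() orders the clear_bit() before the
 * waitqueue check inside wake_up_bit(), so a task that saw the bit set
 * and is about to sleep cannot miss the wakeup.
 */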
+ */ + btrfs_tree_lock(eb); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + end_extent_buffer_writeback(eb); + spin_unlock(&eb->refs_lock); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, + fs_info->dirty_metadata_batch); + btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_tree_unlock(eb); return ret; } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6c8297bcfeb7..1bfd7e34f31e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4985,7 +4985,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); } } continue; @@ -5000,7 +5000,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE, 0, LLONG_MAX, ctx); if (ret) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5009,7 +5009,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5056,7 +5056,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, } path->slots[0]++; } - iput(inode); + btrfs_add_delayed_iput(inode); } return ret; @@ -5689,7 +5689,7 @@ process_leaf: } if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { - iput(di_inode); + btrfs_add_delayed_iput(di_inode); break; } @@ -5701,7 +5701,7 @@ process_leaf: if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; - iput(di_inode); + btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; if (ctx->log_new_dentries) { @@ -5848,7 +5848,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); - iput(dir_inode); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -5891,7 +5891,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); if (ret) return ret; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index b16219e5dac9..350bc3061656 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -16,7 +16,7 @@ config CIFS select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES - select CRYPTO_DES + select CRYPTO_LIB_DES select KEYS help This is the client VFS module for the SMB3 family of NAS protocols, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3289b566463f..4e2f74894e9b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1601,7 +1601,6 @@ MODULE_DESCRIPTION ("VFS to access SMB3 servers e.g. 
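The iput() -> btrfs_add_delayed_iput() conversions above matter because a plain iput() here may drop the final reference and recurse into inode eviction while the tree-log code still holds a transaction handle. Roughly (a sketch of the btrfs mechanism, whose queued inodes are later iput by the cleaner thread), the helper defers only the last reference:

void btrfs_add_delayed_iput(struct inode *inode)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct btrfs_inode *binode = BTRFS_I(inode);

        /* not the last reference: safe to drop directly */
        if (atomic_add_unless(&inode->i_count, -1, 1))
                return;

        /* last reference: queue it for the cleaner thread instead */
        spin_lock(&fs_info->delayed_iput_lock);
        list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
        spin_unlock(&fs_info->delayed_iput_lock);
}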
Samba, Macs, Azure and Windows (and " "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); -MODULE_SOFTDEP("pre: des"); MODULE_SOFTDEP("pre: ecb"); MODULE_SOFTDEP("pre: hmac"); MODULE_SOFTDEP("pre: md4"); diff --git a/fs/cifs/export.c b/fs/cifs/export.c index ce8b7f677c58..eb0bb8ca8e63 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -24,7 +24,7 @@ */ /* - * See Documentation/filesystems/nfs/Exporting + * See Documentation/filesystems/nfs/exporting.rst * and examples in fs/exportfs * * Since cifs is a network file system, an "fsid" must be included for diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 2b6d87bfdf8e..39a938443e3e 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -11,13 +11,14 @@ */ -#include <linux/crypto.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/fips.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/random.h> +#include <crypto/des.h> #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "cifspdu.h" @@ -58,19 +59,18 @@ static int smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) { unsigned char key2[8]; - struct crypto_cipher *tfm_des; + struct des_ctx ctx; str_to_key(key, key2); - tfm_des = crypto_alloc_cipher("des", 0, 0); - if (IS_ERR(tfm_des)) { - cifs_dbg(VFS, "could not allocate des crypto API\n"); - return PTR_ERR(tfm_des); + if (fips_enabled) { + cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n"); + return -ENOENT; } - crypto_cipher_setkey(tfm_des, key2, 8); - crypto_cipher_encrypt_one(tfm_des, out, in); - crypto_free_cipher(tfm_des); + des_expand_key(&ctx, key2, DES_KEY_SIZE); + des_encrypt(&ctx, out, in); + memzero_explicit(&ctx, sizeof(ctx)); return 0; } diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 644d48c12ce8..3aec27e5eb82 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -63,11 +63,8 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, * Look up the pathname. Note that the pathname is in * user memory, and namei takes care of this */ - if (data.follow) - error = user_path(data.path, &path); - else - error = user_lpath(data.path, &path); - + error = user_path_at(AT_FDCWD, data.path, + data.follow ? 
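str_to_key(), called by smbhash() above, is untouched by this patch; for reference, it is the conventional LM/NTLM expansion of 7 key bytes into an 8-byte DES key, spreading the 56 key bits across 8 bytes and leaving the low bit of each byte for parity:

static void str_to_key(unsigned char *str, unsigned char *key)
{
        int i;

        key[0] = str[0] >> 1;
        key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
        key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
        key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
        key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
        key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
        key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
        key[7] = str[6] & 0x7F;
        for (i = 0; i < 8; i++)
                key[i] = (key[i] << 1);
}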
LOOKUP_FOLLOW : 0, &path); if (error) return error; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index f752d83a9c44..520f1813e789 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -20,6 +20,15 @@ #include <linux/list.h> #include <linux/spinlock.h> +struct configfs_fragment { + atomic_t frag_count; + struct rw_semaphore frag_sem; + bool frag_dead; +}; + +void put_fragment(struct configfs_fragment *); +struct configfs_fragment *get_fragment(struct configfs_fragment *); + struct configfs_dirent { atomic_t s_count; int s_dependent_count; @@ -34,6 +43,7 @@ struct configfs_dirent { #ifdef CONFIG_LOCKDEP int s_depth; #endif + struct configfs_fragment *s_frag; }; #define CONFIGFS_ROOT 0x0001 @@ -61,8 +71,8 @@ extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct in extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_create_bin_file(struct config_item *, const struct configfs_bin_attribute *); -extern int configfs_make_dirent(struct configfs_dirent *, - struct dentry *, void *, umode_t, int); +extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, + void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); extern void configfs_hash_and_remove(struct dentry * dir, const char * name); @@ -137,6 +147,7 @@ static inline void release_configfs_dirent(struct configfs_dirent * sd) { if (!(sd->s_type & CONFIGFS_ROOT)) { kfree(sd->s_iattr); + put_fragment(sd->s_frag); kmem_cache_free(configfs_dir_cachep, sd); } } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 92112915de8e..79fc25aaa8cd 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -151,11 +151,38 @@ configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) #endif /* CONFIG_LOCKDEP */ +static struct configfs_fragment *new_fragment(void) +{ + struct configfs_fragment *p; + + p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL); + if (p) { + atomic_set(&p->frag_count, 1); + init_rwsem(&p->frag_sem); + p->frag_dead = false; + } + return p; +} + +void put_fragment(struct configfs_fragment *frag) +{ + if (frag && atomic_dec_and_test(&frag->frag_count)) + kfree(frag); +} + +struct configfs_fragment *get_fragment(struct configfs_fragment *frag) +{ + if (likely(frag)) + atomic_inc(&frag->frag_count); + return frag; +} + /* * Allocates a new configfs_dirent and links it to the parent configfs_dirent */ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, - void *element, int type) + void *element, int type, + struct configfs_fragment *frag) { struct configfs_dirent * sd; @@ -175,6 +202,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren kmem_cache_free(configfs_dir_cachep, sd); return ERR_PTR(-ENOENT); } + sd->s_frag = get_fragment(frag); list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); @@ -209,11 +237,11 @@ static int configfs_dirent_exists(struct configfs_dirent *parent_sd, int configfs_make_dirent(struct configfs_dirent * parent_sd, struct dentry * dentry, void * element, - umode_t mode, int type) + umode_t mode, int type, struct configfs_fragment *frag) { struct configfs_dirent * sd; - sd = configfs_new_dirent(parent_sd, element, type); + sd = configfs_new_dirent(parent_sd, element, type, frag); if (IS_ERR(sd)) return PTR_ERR(sd); @@ -260,7 +288,8 @@ static void init_symlink(struct inode * 
inode) * until it is validated by configfs_dir_set_ready() */ -static int configfs_create_dir(struct config_item *item, struct dentry *dentry) +static int configfs_create_dir(struct config_item *item, struct dentry *dentry, + struct configfs_fragment *frag) { int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; @@ -273,7 +302,8 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) return error; error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, - CONFIGFS_DIR | CONFIGFS_USET_CREATING); + CONFIGFS_DIR | CONFIGFS_USET_CREATING, + frag); if (unlikely(error)) return error; @@ -338,9 +368,10 @@ int configfs_create_link(struct configfs_symlink *sl, { int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; + struct configfs_dirent *p = parent->d_fsdata; - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, - CONFIGFS_ITEM_LINK); + err = configfs_make_dirent(p, dentry, sl, mode, + CONFIGFS_ITEM_LINK, p->s_frag); if (!err) { err = configfs_create(dentry, mode, init_symlink); if (err) { @@ -599,7 +630,8 @@ static int populate_attrs(struct config_item *item) static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry); + struct dentry *dentry, + struct configfs_fragment *frag); static void configfs_detach_group(struct config_item *item); static void detach_groups(struct config_group *group) @@ -647,7 +679,8 @@ static void detach_groups(struct config_group *group) * try using vfs_mkdir. Just a thought. */ static int create_default_group(struct config_group *parent_group, - struct config_group *group) + struct config_group *group, + struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; @@ -663,7 +696,7 @@ static int create_default_group(struct config_group *parent_group, d_add(child, NULL); ret = configfs_attach_group(&parent_group->cg_item, - &group->cg_item, child); + &group->cg_item, child, frag); if (!ret) { sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; @@ -677,13 +710,14 @@ static int create_default_group(struct config_group *parent_group, return ret; } -static int populate_groups(struct config_group *group) +static int populate_groups(struct config_group *group, + struct configfs_fragment *frag) { struct config_group *new_group; int ret = 0; list_for_each_entry(new_group, &group->default_groups, group_entry) { - ret = create_default_group(group, new_group); + ret = create_default_group(group, new_group, frag); if (ret) { detach_groups(group); break; @@ -797,11 +831,12 @@ static void link_group(struct config_group *parent_group, struct config_group *g */ static int configfs_attach_item(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry) + struct dentry *dentry, + struct configfs_fragment *frag) { int ret; - ret = configfs_create_dir(item, dentry); + ret = configfs_create_dir(item, dentry, frag); if (!ret) { ret = populate_attrs(item); if (ret) { @@ -831,12 +866,13 @@ static void configfs_detach_item(struct config_item *item) static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry) + struct dentry *dentry, + struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; - ret = configfs_attach_item(parent_item, item, dentry); + ret = configfs_attach_item(parent_item, item, dentry, frag); if (!ret) { sd = dentry->d_fsdata; sd->s_type |= CONFIGFS_USET_DIR; @@ -852,7 +888,7 @@ static int configfs_attach_group(struct config_item *parent_item, */ 
inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); - ret = populate_groups(to_config_group(item)); + ret = populate_groups(to_config_group(item), frag); if (ret) { configfs_detach_item(item); d_inode(dentry)->i_flags |= S_DEAD; @@ -1247,6 +1283,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct configfs_dirent *sd; const struct config_item_type *type; struct module *subsys_owner = NULL, *new_item_owner = NULL; + struct configfs_fragment *frag; char *name; sd = dentry->d_parent->d_fsdata; @@ -1265,6 +1302,12 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out; } + frag = new_fragment(); + if (!frag) { + ret = -ENOMEM; + goto out; + } + /* Get a working ref for the duration of this function */ parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; @@ -1367,9 +1410,9 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode spin_unlock(&configfs_dirent_lock); if (group) - ret = configfs_attach_group(parent_item, item, dentry); + ret = configfs_attach_group(parent_item, item, dentry, frag); else - ret = configfs_attach_item(parent_item, item, dentry); + ret = configfs_attach_item(parent_item, item, dentry, frag); spin_lock(&configfs_dirent_lock); sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; @@ -1406,6 +1449,7 @@ out_put: * reference. */ config_item_put(parent_item); + put_fragment(frag); out: return ret; @@ -1417,6 +1461,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct config_item *item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; + struct configfs_fragment *frag; struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; @@ -1474,6 +1519,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } } while (ret == -EAGAIN); + frag = sd->s_frag; + if (down_write_killable(&frag->frag_sem)) { + spin_lock(&configfs_dirent_lock); + configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); + return -EINTR; + } + frag->frag_dead = true; + up_write(&frag->frag_sem); + /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); @@ -1574,7 +1629,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) */ err = -ENOENT; if (configfs_dirent_is_ready(parent_sd)) { - file->private_data = configfs_new_dirent(parent_sd, NULL, 0); + file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); if (IS_ERR(file->private_data)) err = PTR_ERR(file->private_data); else @@ -1732,8 +1787,13 @@ int configfs_register_group(struct config_group *parent_group, { struct configfs_subsystem *subsys = parent_group->cg_subsys; struct dentry *parent; + struct configfs_fragment *frag; int ret; + frag = new_fragment(); + if (!frag) + return -ENOMEM; + mutex_lock(&subsys->su_mutex); link_group(parent_group, group); mutex_unlock(&subsys->su_mutex); @@ -1741,7 +1801,7 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - ret = create_default_group(parent_group, group); + ret = create_default_group(parent_group, group, frag); if (ret) goto err_out; @@ -1749,12 +1809,14 @@ int configfs_register_group(struct config_group *parent_group, configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); inode_unlock(d_inode(parent)); + put_fragment(frag); return 0; 
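The fragment is effectively a revoke mechanism: teardown paths (rmdir above, and the unregister paths below) take frag_sem for write and set frag_dead, which both waits out in-flight ->show()/->store() calls and blocks new ones. Condensed, the two sides look like:

/* teardown side: after this, no attribute method will be entered again */
down_write(&frag->frag_sem);
frag->frag_dead = true;
up_write(&frag->frag_sem);

/* accessor side (see fs/configfs/file.c below): call out only while alive */
down_read(&frag->frag_sem);
if (!frag->frag_dead)
        ret = attr->show(item, buffer->page);
else
        ret = -ENOENT;
up_read(&frag->frag_sem);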
err_out: inode_unlock(d_inode(parent)); mutex_lock(&subsys->su_mutex); unlink_group(group); mutex_unlock(&subsys->su_mutex); + put_fragment(frag); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1770,16 +1832,12 @@ void configfs_unregister_group(struct config_group *group) struct configfs_subsystem *subsys = group->cg_subsys; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_fragment *frag = sd->s_frag; - mutex_lock(&subsys->su_mutex); - if (!group->cg_item.ci_parent->ci_group) { - /* - * The parent has already been unlinked and detached - * due to a rmdir. - */ - goto unlink_group; - } - mutex_unlock(&subsys->su_mutex); + down_write(&frag->frag_sem); + frag->frag_dead = true; + up_write(&frag->frag_sem); inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); @@ -1796,7 +1854,6 @@ void configfs_unregister_group(struct config_group *group) dput(dentry); mutex_lock(&subsys->su_mutex); -unlink_group: unlink_group(group); mutex_unlock(&subsys->su_mutex); } @@ -1853,10 +1910,17 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) struct dentry *dentry; struct dentry *root; struct configfs_dirent *sd; + struct configfs_fragment *frag; + + frag = new_fragment(); + if (!frag) + return -ENOMEM; root = configfs_pin_fs(); - if (IS_ERR(root)) + if (IS_ERR(root)) { + put_fragment(frag); return PTR_ERR(root); + } if (!group->cg_item.ci_name) group->cg_item.ci_name = group->cg_item.ci_namebuf; @@ -1872,7 +1936,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) d_add(dentry, NULL); err = configfs_attach_group(sd->s_element, &group->cg_item, - dentry); + dentry, frag); if (err) { BUG_ON(d_inode(dentry)); d_drop(dentry); @@ -1890,6 +1954,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) unlink_group(group); configfs_release_fs(); } + put_fragment(frag); return err; } @@ -1899,12 +1964,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct config_group *group = &subsys->su_group; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *root = dentry->d_sb->s_root; + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_fragment *frag = sd->s_frag; if (dentry->d_parent != root) { pr_err("Tried to unregister non-subsystem!\n"); return; } + down_write(&frag->frag_sem); + frag->frag_dead = true; + up_write(&frag->frag_sem); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 61e4db4390a1..fb65b706cc0d 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -39,40 +39,44 @@ struct configfs_buffer { bool write_in_progress; char *bin_buffer; int bin_buffer_size; + int cb_max_size; + struct config_item *item; + struct module *owner; + union { + struct configfs_attribute *attr; + struct configfs_bin_attribute *bin_attr; + }; }; +static inline struct configfs_fragment *to_frag(struct file *file) +{ + struct configfs_dirent *sd = file->f_path.dentry->d_fsdata; -/** - * fill_read_buffer - allocate and fill buffer from item. - * @dentry: dentry pointer. - * @buffer: data buffer for file. - * - * Allocate @buffer->page, if it hasn't been already, then call the - * config_item's show() method to fill the buffer with this attribute's - * data. - * This is called only once, on the file's first read. 
- */ -static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) + return sd->s_frag; +} + +static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer) { - struct configfs_attribute * attr = to_attr(dentry); - struct config_item * item = to_item(dentry->d_parent); - int ret = 0; - ssize_t count; + struct configfs_fragment *frag = to_frag(file); + ssize_t count = -ENOENT; if (!buffer->page) buffer->page = (char *) get_zeroed_page(GFP_KERNEL); if (!buffer->page) return -ENOMEM; - count = attr->show(item, buffer->page); - - BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); - if (count >= 0) { - buffer->needs_read_fill = 0; - buffer->count = count; - } else - ret = count; - return ret; + down_read(&frag->frag_sem); + if (!frag->frag_dead) + count = buffer->attr->show(buffer->item, buffer->page); + up_read(&frag->frag_sem); + + if (count < 0) + return count; + if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE)) + return -EIO; + buffer->needs_read_fill = 0; + buffer->count = count; + return 0; } /** @@ -97,12 +101,13 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf static ssize_t configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct configfs_buffer * buffer = file->private_data; + struct configfs_buffer *buffer = file->private_data; ssize_t retval = 0; mutex_lock(&buffer->mutex); if (buffer->needs_read_fill) { - if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) + retval = fill_read_buffer(file, buffer); + if (retval) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", @@ -138,10 +143,8 @@ static ssize_t configfs_read_bin_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + struct configfs_fragment *frag = to_frag(file); struct configfs_buffer *buffer = file->private_data; - struct dentry *dentry = file->f_path.dentry; - struct config_item *item = to_item(dentry->d_parent); - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); ssize_t retval = 0; ssize_t len = min_t(size_t, count, PAGE_SIZE); @@ -156,14 +159,19 @@ configfs_read_bin_file(struct file *file, char __user *buf, if (buffer->needs_read_fill) { /* perform first read with buf == NULL to get extent */ - len = bin_attr->read(item, NULL, 0); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + len = buffer->bin_attr->read(buffer->item, NULL, 0); + else + len = -ENOENT; + up_read(&frag->frag_sem); if (len <= 0) { retval = len; goto out; } /* do not exceed the maximum value */ - if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) { + if (buffer->cb_max_size && len > buffer->cb_max_size) { retval = -EFBIG; goto out; } @@ -176,7 +184,13 @@ configfs_read_bin_file(struct file *file, char __user *buf, buffer->bin_buffer_size = len; /* perform second read to fill buffer */ - len = bin_attr->read(item, buffer->bin_buffer, len); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + len = buffer->bin_attr->read(buffer->item, + buffer->bin_buffer, len); + else + len = -ENOENT; + up_read(&frag->frag_sem); if (len < 0) { retval = len; vfree(buffer->bin_buffer); @@ -226,25 +240,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size return error ? -EFAULT : count; } - -/** - * flush_write_buffer - push buffer to config_item. - * @dentry: dentry to the attribute - * @buffer: data buffer for file. 
- * @count: number of bytes - * - * Get the correct pointers for the config_item and the attribute we're - * dealing with, then call the store() method for the attribute, - * passing the buffer that we acquired in fill_write_buffer(). - */ - static int -flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count) { - struct configfs_attribute * attr = to_attr(dentry); - struct config_item * item = to_item(dentry->d_parent); - - return attr->store(item, buffer->page, count); + struct configfs_fragment *frag = to_frag(file); + int res = -ENOENT; + + down_read(&frag->frag_sem); + if (!frag->frag_dead) + res = buffer->attr->store(buffer->item, buffer->page, count); + up_read(&frag->frag_sem); + return res; } @@ -268,13 +274,13 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct configfs_buffer * buffer = file->private_data; + struct configfs_buffer *buffer = file->private_data; ssize_t len; mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, len); + len = flush_write_buffer(file, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); @@ -299,8 +305,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer *buffer = file->private_data; - struct dentry *dentry = file->f_path.dentry; - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); void *tbuf = NULL; ssize_t len; @@ -316,8 +320,8 @@ configfs_write_bin_file(struct file *file, const char __user *buf, /* buffer grows? 
*/ if (*ppos + count > buffer->bin_buffer_size) { - if (bin_attr->cb_max_size && - *ppos + count > bin_attr->cb_max_size) { + if (buffer->cb_max_size && + *ppos + count > buffer->cb_max_size) { len = -EFBIG; goto out; } @@ -349,31 +353,51 @@ out: return len; } -static int check_perm(struct inode * inode, struct file * file, int type) +static int __configfs_open_file(struct inode *inode, struct file *file, int type) { - struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); - struct configfs_attribute * attr = to_attr(file->f_path.dentry); - struct configfs_bin_attribute *bin_attr = NULL; - struct configfs_buffer * buffer; - struct configfs_item_operations * ops = NULL; - int error = 0; + struct dentry *dentry = file->f_path.dentry; + struct configfs_fragment *frag = to_frag(file); + struct configfs_attribute *attr; + struct configfs_buffer *buffer; + int error; - if (!item || !attr) - goto Einval; + error = -ENOMEM; + buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL); + if (!buffer) + goto out; - if (type & CONFIGFS_ITEM_BIN_ATTR) - bin_attr = to_bin_attr(file->f_path.dentry); + error = -ENOENT; + down_read(&frag->frag_sem); + if (unlikely(frag->frag_dead)) + goto out_free_buffer; - /* Grab the module reference for this attribute if we have one */ - if (!try_module_get(attr->ca_owner)) { - error = -ENODEV; - goto Done; + error = -EINVAL; + buffer->item = to_item(dentry->d_parent); + if (!buffer->item) + goto out_free_buffer; + + attr = to_attr(dentry); + if (!attr) + goto out_put_item; + + if (type & CONFIGFS_ITEM_BIN_ATTR) { + buffer->bin_attr = to_bin_attr(dentry); + buffer->cb_max_size = buffer->bin_attr->cb_max_size; + } else { + buffer->attr = attr; } - if (item->ci_type) - ops = item->ci_type->ct_item_ops; - else - goto Eaccess; + buffer->owner = attr->ca_owner; + /* Grab the module reference for this attribute if we have one */ + error = -ENODEV; + if (!try_module_get(buffer->owner)) + goto out_put_item; + + error = -EACCES; + if (!buffer->item->ci_type) + goto out_put_module; + + buffer->ops = buffer->item->ci_type->ct_item_ops; /* File needs write support. * The inode's perms must say it's ok, @@ -381,13 +405,11 @@ static int check_perm(struct inode * inode, struct file * file, int type) */ if (file->f_mode & FMODE_WRITE) { if (!(inode->i_mode & S_IWUGO)) - goto Eaccess; - + goto out_put_module; if ((type & CONFIGFS_ITEM_ATTR) && !attr->store) - goto Eaccess; - - if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write) - goto Eaccess; + goto out_put_module; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write) + goto out_put_module; } /* File needs read support. @@ -396,92 +418,72 @@ static int check_perm(struct inode * inode, struct file * file, int type) */ if (file->f_mode & FMODE_READ) { if (!(inode->i_mode & S_IRUGO)) - goto Eaccess; - + goto out_put_module; if ((type & CONFIGFS_ITEM_ATTR) && !attr->show) - goto Eaccess; - - if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read) - goto Eaccess; + goto out_put_module; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read) + goto out_put_module; } - /* No error? Great, allocate a buffer for the file, and store it - * it in file->private_data for easy access. 
- */ - buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); - if (!buffer) { - error = -ENOMEM; - goto Enomem; - } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; buffer->read_in_progress = false; buffer->write_in_progress = false; - buffer->ops = ops; file->private_data = buffer; - goto Done; + up_read(&frag->frag_sem); + return 0; - Einval: - error = -EINVAL; - goto Done; - Eaccess: - error = -EACCES; - Enomem: - module_put(attr->ca_owner); - Done: - if (error && item) - config_item_put(item); +out_put_module: + module_put(buffer->owner); +out_put_item: + config_item_put(buffer->item); +out_free_buffer: + up_read(&frag->frag_sem); + kfree(buffer); +out: return error; } static int configfs_release(struct inode *inode, struct file *filp) { - struct config_item * item = to_item(filp->f_path.dentry->d_parent); - struct configfs_attribute * attr = to_attr(filp->f_path.dentry); - struct module * owner = attr->ca_owner; - struct configfs_buffer * buffer = filp->private_data; - - if (item) - config_item_put(item); - /* After this point, attr should not be accessed. */ - module_put(owner); - - if (buffer) { - if (buffer->page) - free_page((unsigned long)buffer->page); - mutex_destroy(&buffer->mutex); - kfree(buffer); - } + struct configfs_buffer *buffer = filp->private_data; + + module_put(buffer->owner); + if (buffer->page) + free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); + kfree(buffer); return 0; } static int configfs_open_file(struct inode *inode, struct file *filp) { - return check_perm(inode, filp, CONFIGFS_ITEM_ATTR); + return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR); } static int configfs_open_bin_file(struct inode *inode, struct file *filp) { - return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR); + return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR); } -static int configfs_release_bin_file(struct inode *inode, struct file *filp) +static int configfs_release_bin_file(struct inode *inode, struct file *file) { - struct configfs_buffer *buffer = filp->private_data; - struct dentry *dentry = filp->f_path.dentry; - struct config_item *item = to_item(dentry->d_parent); - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); - ssize_t len = 0; - int ret; + struct configfs_buffer *buffer = file->private_data; buffer->read_in_progress = false; if (buffer->write_in_progress) { + struct configfs_fragment *frag = to_frag(file); buffer->write_in_progress = false; - len = bin_attr->write(item, buffer->bin_buffer, - buffer->bin_buffer_size); - + down_read(&frag->frag_sem); + if (!frag->frag_dead) { + /* result of ->release() is ignored */ + buffer->bin_attr->write(buffer->item, + buffer->bin_buffer, + buffer->bin_buffer_size); + } + up_read(&frag->frag_sem); /* vfree on NULL is safe */ vfree(buffer->bin_buffer); buffer->bin_buffer = NULL; @@ -489,10 +491,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp) buffer->needs_read_fill = 1; } - ret = configfs_release(inode, filp); - if (len < 0) - return len; - return ret; + configfs_release(inode, file); + return 0; } @@ -527,7 +527,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, - CONFIGFS_ITEM_ATTR); + CONFIGFS_ITEM_ATTR, parent_sd->s_frag); inode_unlock(d_inode(dir)); return error; @@ -549,7 +549,7 @@ int configfs_create_bin_file(struct config_item *item, inode_lock_nested(dir->d_inode, 
I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, - CONFIGFS_ITEM_BIN_ATTR); + CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag); inode_unlock(dir->d_inode); return error; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 5fdf24877c17..ff5a1746cbae 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -7,6 +7,8 @@ config FS_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_SHA512 + select CRYPTO_HMAC select KEYS help Enable encryption of files and directories. This diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 4f0df5e682e4..232e2bb5a337 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,5 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o +fscrypto-y := crypto.o \ + fname.o \ + hkdf.o \ + hooks.o \ + keyring.o \ + keysetup.o \ + keysetup_v1.o \ + policy.o + fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 45c3d0427fb2..32a7ad0098cc 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -141,7 +141,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, memset(iv, 0, ci->ci_mode->ivsize); iv->lblk_num = cpu_to_le64(lblk_num); - if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) + if (fscrypt_is_direct_key_policy(&ci->ci_policy)) memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); if (ci->ci_essiv_tfm != NULL) @@ -188,10 +188,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - fscrypt_err(inode->i_sb, - "%scryption failed for inode %lu, block %llu: %d", - (rw == FS_DECRYPT ? "de" : "en"), - inode->i_ino, lblk_num, res); + fscrypt_err(inode, "%scryption failed for block %llu: %d", + (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res); return res; } return 0; @@ -453,7 +451,7 @@ fail: return res; } -void fscrypt_msg(struct super_block *sb, const char *level, +void fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, @@ -467,8 +465,9 @@ void fscrypt_msg(struct super_block *sb, const char *level, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - if (sb) - printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf); + if (inode) + printk("%sfscrypt (%s, inode %lu): %pV\n", + level, inode->i_sb->s_id, inode->i_ino, &vaf); else printk("%sfscrypt: %pV\n", level, &vaf); va_end(args); @@ -479,6 +478,8 @@ void fscrypt_msg(struct super_block *sb, const char *level, */ static int __init fscrypt_init(void) { + int err = -ENOMEM; + /* * Use an unbound workqueue to allow bios to be decrypted in parallel * even when they happen to complete on the same CPU. 
This sacrifices @@ -501,31 +502,19 @@ static int __init fscrypt_init(void) if (!fscrypt_info_cachep) goto fail_free_ctx; + err = fscrypt_init_keyring(); + if (err) + goto fail_free_info; + return 0; +fail_free_info: + kmem_cache_destroy(fscrypt_info_cachep); fail_free_ctx: kmem_cache_destroy(fscrypt_ctx_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: - return -ENOMEM; -} -module_init(fscrypt_init) - -/** - * fscrypt_exit() - Shutdown the fs encryption system - */ -static void __exit fscrypt_exit(void) -{ - fscrypt_destroy(); - - if (fscrypt_read_workqueue) - destroy_workqueue(fscrypt_read_workqueue); - kmem_cache_destroy(fscrypt_ctx_cachep); - kmem_cache_destroy(fscrypt_info_cachep); - - fscrypt_essiv_cleanup(); + return err; } -module_exit(fscrypt_exit); - -MODULE_LICENSE("GPL"); +late_initcall(fscrypt_init) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 00d150ff3033..3da3707c10e3 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -71,9 +71,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - fscrypt_err(inode->i_sb, - "Filename encryption failed for inode %lu: %d", - inode->i_ino, res); + fscrypt_err(inode, "Filename encryption failed: %d", res); return res; } @@ -117,9 +115,7 @@ static int fname_decrypt(struct inode *inode, res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - fscrypt_err(inode->i_sb, - "Filename decryption failed for inode %lu: %d", - inode->i_ino, res); + fscrypt_err(inode, "Filename decryption failed: %d", res); return res; } @@ -127,44 +123,45 @@ static int fname_decrypt(struct inode *inode, return 0; } -static const char *lookup_table = +static const char lookup_table[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * digest_encode() - + * base64_encode() - * - * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. + * Encodes the input string using characters from the set [A-Za-z0-9+,]. * The encoded string is roughly 4/3 times the size of the input string. 
+ * + * Return: length of the encoded string */ -static int digest_encode(const char *src, int len, char *dst) +static int base64_encode(const u8 *src, int len, char *dst) { - int i = 0, bits = 0, ac = 0; + int i, bits = 0, ac = 0; char *cp = dst; - while (i < len) { - ac += (((unsigned char) src[i]) << bits); + for (i = 0; i < len; i++) { + ac += src[i] << bits; bits += 8; do { *cp++ = lookup_table[ac & 0x3f]; ac >>= 6; bits -= 6; } while (bits >= 6); - i++; } if (bits) *cp++ = lookup_table[ac & 0x3f]; return cp - dst; } -static int digest_decode(const char *src, int len, char *dst) +static int base64_decode(const char *src, int len, u8 *dst) { - int i = 0, bits = 0, ac = 0; + int i, bits = 0, ac = 0; const char *p; - char *cp = dst; + u8 *cp = dst; - while (i < len) { + for (i = 0; i < len; i++) { p = strchr(lookup_table, src[i]); if (p == NULL || src[i] == 0) return -2; @@ -175,7 +172,6 @@ static int digest_decode(const char *src, int len, char *dst) ac >>= 8; bits -= 8; } - i++; } if (ac) return -1; @@ -185,8 +181,9 @@ static int digest_decode(const char *src, int len, char *dst) bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - int padding = 4 << (inode->i_crypt_info->ci_flags & - FS_POLICY_FLAGS_PAD_MASK); + const struct fscrypt_info *ci = inode->i_crypt_info; + int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) & + FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; if (orig_len > max_len) @@ -272,7 +269,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { - oname->len = digest_encode(iname->name, iname->len, + oname->len = base64_encode(iname->name, iname->len, oname->name); return 0; } @@ -287,7 +284,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, FSCRYPT_FNAME_DIGEST(iname->name, iname->len), FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode((const char *)&digested_name, + oname->len = 1 + base64_encode((const u8 *)&digested_name, sizeof(digested_name), oname->name + 1); return 0; } @@ -380,8 +377,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + digested, iname->len - digested, - fname->crypto_buf.name); + ret = base64_decode(iname->name + digested, iname->len - digested, + fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8978eec9d766..e84efc01512e 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -4,9 +4,8 @@ * * Copyright (C) 2015, Google, Inc. * - * This contains encryption key functions. - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. 
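A quick usage sketch for the two helpers above (hypothetical buffers; the sizes follow from the 4/3 expansion that BASE64_CHARS() computes):

u8 nonce[16];                   /* e.g. FS_KEY_DERIVATION_NONCE_SIZE bytes */
char enc[BASE64_CHARS(16)];     /* DIV_ROUND_UP(16 * 4, 3) == 22 chars */
u8 dec[16];
int elen, dlen;

elen = base64_encode(nonce, sizeof(nonce), enc); /* elen == 22; not NUL-terminated */
dlen = base64_decode(enc, elen, dec);            /* dlen == 16 on success */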
*/ #ifndef _FSCRYPT_PRIVATE_H @@ -15,30 +14,133 @@ #include <linux/fscrypt.h> #include <crypto/hash.h> -/* Encryption parameters */ +#define CONST_STRLEN(str) (sizeof(str) - 1) + #define FS_KEY_DERIVATION_NONCE_SIZE 16 -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct fscrypt_context { - u8 format; +#define FSCRYPT_MIN_KEY_SIZE 16 + +#define FSCRYPT_CONTEXT_V1 1 +#define FSCRYPT_CONTEXT_V2 2 + +struct fscrypt_context_v1 { + u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; - u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; -} __packed; +}; -#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +struct fscrypt_context_v2 { + u8 version; /* FSCRYPT_CONTEXT_V2 */ + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 __reserved[4]; + u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +/** + * fscrypt_context - the encryption context of an inode + * + * This is the on-disk equivalent of an fscrypt_policy, stored alongside each + * encrypted file usually in a hidden extended attribute. It contains the + * fields from the fscrypt_policy, in order to identify the encryption algorithm + * and key with which the file is encrypted. It also contains a nonce that was + * randomly generated by fscrypt itself; this is used as KDF input or as a tweak + * to cause different files to be encrypted differently. + */ +union fscrypt_context { + u8 version; + struct fscrypt_context_v1 v1; + struct fscrypt_context_v2 v2; +}; + +/* + * Return the size expected for the given fscrypt_context based on its version + * number, or 0 if the context version is unrecognized. + */ +static inline int fscrypt_context_size(const union fscrypt_context *ctx) +{ + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + BUILD_BUG_ON(sizeof(ctx->v1) != 28); + return sizeof(ctx->v1); + case FSCRYPT_CONTEXT_V2: + BUILD_BUG_ON(sizeof(ctx->v2) != 40); + return sizeof(ctx->v2); + } + return 0; +} + +#undef fscrypt_policy +union fscrypt_policy { + u8 version; + struct fscrypt_policy_v1 v1; + struct fscrypt_policy_v2 v2; +}; + +/* + * Return the size expected for the given fscrypt_policy based on its version + * number, or 0 if the policy version is unrecognized. 
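fscrypt_context_size() exists so callers can validate a context read back from disk; a sketch of the expected use, assuming the filesystem's ->get_context operation from struct fscrypt_operations:

union fscrypt_context ctx;
int ret;

ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (ret < 0)
        return ret;
/* an unrecognized version yields size 0, so truncated or bogus contexts fail here */
if (ret < 1 || ret != fscrypt_context_size(&ctx))
        return -EINVAL;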
+ */ +static inline int fscrypt_policy_size(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return sizeof(policy->v1); + case FSCRYPT_POLICY_V2: + return sizeof(policy->v2); + } + return 0; +} + +/* Return the contents encryption mode of a valid encryption policy */ +static inline u8 +fscrypt_policy_contents_mode(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return policy->v1.contents_encryption_mode; + case FSCRYPT_POLICY_V2: + return policy->v2.contents_encryption_mode; + } + BUG(); +} + +/* Return the filenames encryption mode of a valid encryption policy */ +static inline u8 +fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return policy->v1.filenames_encryption_mode; + case FSCRYPT_POLICY_V2: + return policy->v2.filenames_encryption_mode; + } + BUG(); +} + +/* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ +static inline u8 +fscrypt_policy_flags(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return policy->v1.flags; + case FSCRYPT_POLICY_V2: + return policy->v2.flags; + } + BUG(); +} + +static inline bool +fscrypt_is_direct_key_policy(const union fscrypt_policy *policy) +{ + return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY; +} /** * For encrypted symlinks, the ciphertext length is stored at the beginning @@ -68,23 +170,37 @@ struct fscrypt_info { struct crypto_cipher *ci_essiv_tfm; /* - * Encryption mode used for this inode. It corresponds to either - * ci_data_mode or ci_filename_mode, depending on the inode type. + * Encryption mode used for this inode. It corresponds to either the + * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; + /* Back-pointer to the inode */ + struct inode *ci_inode; + + /* + * The master key with which this inode was unlocked (decrypted). This + * will be NULL if the master key was found in a process-subscribed + * keyring rather than in the filesystem-level keyring. + */ + struct key *ci_master_key; + + /* + * Link in list of inodes that were unlocked with the master key. + * Only used when ->ci_master_key is set. + */ + struct list_head ci_master_key_link; + /* - * If non-NULL, then this inode uses a master key directly rather than a - * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm. - * Otherwise, this inode uses a derived key. + * If non-NULL, then encryption is done using the master key directly + * and ci_ctfm will equal ci_direct_key->dk_ctfm. 
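These per-version accessors let policy checks be written once rather than per version; for example, a mode-validity check can combine them with fscrypt_valid_enc_modes() along these lines (a sketch; the real fscrypt_supported_policy() in policy.c also validates flags and reserved fields):

static bool supported_enc_modes(const union fscrypt_policy *policy)
{
        return fscrypt_valid_enc_modes(fscrypt_policy_contents_mode(policy),
                                       fscrypt_policy_fnames_mode(policy));
}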
*/ - struct fscrypt_master_key *ci_master_key; + struct fscrypt_direct_key *ci_direct_key; - /* fields from the fscrypt_context */ - u8 ci_data_mode; - u8 ci_filename_mode; - u8 ci_flags; - u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + /* The encryption policy used by this inode */ + union fscrypt_policy ci_policy; + + /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; }; @@ -98,16 +214,16 @@ typedef enum { static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { - if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && - filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + if (contents_mode == FSCRYPT_MODE_AES_128_CBC && + filenames_mode == FSCRYPT_MODE_AES_128_CTS) return true; - if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && - filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_CTS) return true; - if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM && - filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM) + if (contents_mode == FSCRYPT_MODE_ADIANTUM && + filenames_mode == FSCRYPT_MODE_ADIANTUM) return true; return false; @@ -125,12 +241,12 @@ extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); extern const struct dentry_operations fscrypt_d_ops; extern void __printf(3, 4) __cold -fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...); +fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); -#define fscrypt_warn(sb, fmt, ...) \ - fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) -#define fscrypt_err(sb, fmt, ...) \ - fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) +#define fscrypt_warn(inode, fmt, ...) \ + fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) +#define fscrypt_err(inode, fmt, ...) \ + fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 @@ -155,7 +271,172 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); -/* keyinfo.c */ +/* hkdf.c */ + +struct fscrypt_hkdf { + struct crypto_shash *hmac_tfm; +}; + +extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); + +/* + * The list of contexts in which fscrypt uses HKDF. These values are used as + * the first byte of the HKDF application-specific info string to guarantee that + * info strings are never repeated between contexts. This ensures that all HKDF + * outputs are unique and cryptographically isolated, i.e. knowledge of one + * output doesn't reveal another. + */ +#define HKDF_CONTEXT_KEY_IDENTIFIER 1 +#define HKDF_CONTEXT_PER_FILE_KEY 2 +#define HKDF_CONTEXT_PER_MODE_KEY 3 + +extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); + +extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); + +/* keyring.c */ + +/* + * fscrypt_master_key_secret - secret key material of an in-use master key + */ +struct fscrypt_master_key_secret { + + /* + * For v2 policy keys: HKDF context keyed by this master key. + * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). + */ + struct fscrypt_hkdf hkdf; + + /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + u32 size; + + /* For v1 policy keys: the raw key. Wiped for v2 policy keys. 
*/ + u8 raw[FSCRYPT_MAX_KEY_SIZE]; + +} __randomize_layout; + +/* + * fscrypt_master_key - an in-use master key + * + * This represents a master encryption key which has been added to the + * filesystem and can be used to "unlock" the encrypted files which were + * encrypted with it. + */ +struct fscrypt_master_key { + + /* + * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is + * executed, this is wiped and no new inodes can be unlocked with this + * key; however, there may still be inodes in ->mk_decrypted_inodes + * which could not be evicted. As long as some inodes still remain, + * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or + * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. + * + * Locking: protected by key->sem (outer) and mk_secret_sem (inner). + * The reason for two locks is that key->sem also protects modifying + * mk_users, which ranks it above the semaphore for the keyring key + * type, which is in turn above page faults (via keyring_read). But + * sometimes filesystems call fscrypt_get_encryption_info() from within + * a transaction, which ranks it below page faults. So we need a + * separate lock which protects mk_secret but not also mk_users. + */ + struct fscrypt_master_key_secret mk_secret; + struct rw_semaphore mk_secret_sem; + + /* + * For v1 policy keys: an arbitrary key descriptor which was assigned by + * userspace (->descriptor). + * + * For v2 policy keys: a cryptographic hash of this key (->identifier). + */ + struct fscrypt_key_specifier mk_spec; + + /* + * Keyring which contains a key of type 'key_type_fscrypt_user' for each + * user who has added this key. Normally each key will be added by just + * one user, but it's possible that multiple users share a key, and in + * that case we need to keep track of those users so that one user can't + * remove the key before the others want it removed too. + * + * This is NULL for v1 policy keys; those can only be added by root. + * + * Locking: in addition to this keyring's own semaphore, this is + * protected by the master key's key->sem, so we can do atomic + * search+insert. It can also be searched without taking any locks, but + * in that case the returned key may have already been removed. + */ + struct key *mk_users; + + /* + * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. + * Once this goes to 0, the master key is removed from ->s_master_keys. + * The 'struct fscrypt_master_key' will continue to live as long as the + * 'struct key' whose payload it is, but we won't let this reference + * count rise again. + */ + refcount_t mk_refcount; + + /* + * List of inodes that were unlocked using this key. This allows the + * inodes to be evicted efficiently if the key is removed. + */ + struct list_head mk_decrypted_inodes; + spinlock_t mk_decrypted_inodes_lock; + + /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */ + struct crypto_skcipher *mk_mode_keys[__FSCRYPT_MODE_MAX + 1]; + +} __randomize_layout; + +static inline bool +is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) +{ + /* + * The READ_ONCE() is only necessary for fscrypt_drop_inode() and + * fscrypt_key_describe(). These run in atomic context, so they can't + * take ->mk_secret_sem and thus 'secret' can change concurrently, which + * would be a data race. But they only need to know whether the secret + * *was* present at the time of check, so READ_ONCE() suffices. 
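mk_decrypted_inodes is what lets FS_IOC_REMOVE_ENCRYPTION_KEY find and evict the affected inodes without scanning the whole inode cache; at key-setup time each unlocked inode is linked in, roughly as follows (a sketch; 'key' is the keyring key whose payload is this fscrypt_master_key, 'ci' the inode's fscrypt_info):

refcount_inc(&mk->mk_refcount);
ci->ci_master_key = key_get(key);
spin_lock(&mk->mk_decrypted_inodes_lock);
list_add(&ci->ci_master_key_link, &mk->mk_decrypted_inodes);
spin_unlock(&mk->mk_decrypted_inodes_lock);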
+ */ + return READ_ONCE(secret->size) != 0; +} + +static inline const char *master_key_spec_type( + const struct fscrypt_key_specifier *spec) +{ + switch (spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return "descriptor"; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return "identifier"; + } + return "[unknown]"; +} + +static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) +{ + switch (spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return FSCRYPT_KEY_DESCRIPTOR_SIZE; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return FSCRYPT_KEY_IDENTIFIER_SIZE; + } + return 0; +} + +extern struct key * +fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec); + +extern int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); + +extern int __init fscrypt_init_keyring(void); + +/* keysetup.c */ struct fscrypt_mode { const char *friendly_name; @@ -166,6 +447,36 @@ struct fscrypt_mode { bool needs_essiv; }; -extern void __exit fscrypt_essiv_cleanup(void); +static inline bool +fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode) +{ + return mode->ivsize >= offsetofend(union fscrypt_iv, nonce); +} + +extern struct crypto_skcipher * +fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, + const struct inode *inode); + +extern int fscrypt_set_derived_key(struct fscrypt_info *ci, + const u8 *derived_key); + +/* keysetup_v1.c */ + +extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); + +extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); + +extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings( + struct fscrypt_info *ci); +/* policy.c */ + +extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c new file mode 100644 index 000000000000..f21873e1b467 --- /dev/null +++ b/fs/crypto/hkdf.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * + * This is used to derive keys from the fscrypt master keys. + * + * Copyright 2019 Google LLC + */ + +#include <crypto/hash.h> +#include <crypto/sha.h> + +#include "fscrypt_private.h" + +/* + * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses + * SHA-512 because it is reasonably secure and efficient; and since it produces + * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of + * entropy from the master key and requires only one iteration of HKDF-Expand. + */ +#define HKDF_HMAC_ALG "hmac(sha512)" +#define HKDF_HASHLEN SHA512_DIGEST_SIZE + +/* + * HKDF consists of two steps: + * + * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from + * the input keying material and optional salt. + * 2. HKDF-Expand: expand the pseudorandom key into output keying material of + * any length, parameterized by an application-specific info string. 
+ * + * HKDF-Extract can be skipped if the input is already a pseudorandom key of + * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take + * shorter keys, and we don't want to force users of those modes to provide + * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No + * salt is used, since fscrypt master keys should already be pseudorandom and + * there's no way to persist a random salt per master key from kernel mode. + */ + +/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ +static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) +{ + static const u8 default_salt[HKDF_HASHLEN]; + SHASH_DESC_ON_STACK(desc, hmac_tfm); + int err; + + err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); + if (err) + return err; + + desc->tfm = hmac_tfm; + err = crypto_shash_digest(desc, ikm, ikmlen, prk); + shash_desc_zero(desc); + return err; +} + +/* + * Compute HKDF-Extract using the given master key as the input keying material, + * and prepare an HMAC transform object keyed by the resulting pseudorandom key. + * + * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many + * times without having to recompute HKDF-Extract each time. + */ +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size) +{ + struct crypto_shash *hmac_tfm; + u8 prk[HKDF_HASHLEN]; + int err; + + hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); + if (IS_ERR(hmac_tfm)) { + fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", + PTR_ERR(hmac_tfm)); + return PTR_ERR(hmac_tfm); + } + + if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { + err = -EINVAL; + goto err_free_tfm; + } + + err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + if (err) + goto err_free_tfm; + + err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); + if (err) + goto err_free_tfm; + + hkdf->hmac_tfm = hmac_tfm; + goto out; + +err_free_tfm: + crypto_free_shash(hmac_tfm); +out: + memzero_explicit(prk, sizeof(prk)); + return err; +} + +/* + * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which + * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' + * bytes of output keying material parameterized by the application-specific + * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' + * byte. This is thread-safe and may be called by multiple threads in parallel. + * + * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt + * adds to its application-specific info strings to guarantee that it doesn't + * accidentally repeat an info string when using HKDF for different purposes.) 
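+ *
+ * Concretely, each output block computed below is
+ *
+ *	T(i) = HMAC-SHA512(PRK, T(i-1) | "fscrypt\0" | context | info | i)
+ *
+ * For example, the per-file keys set up in keysetup.c use the
+ * HKDF_CONTEXT_PER_FILE_KEY context byte with the file's nonce as 'info'.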
+ */ +int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) +{ + SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); + u8 prefix[9]; + unsigned int i; + int err; + const u8 *prev = NULL; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + if (WARN_ON(okmlen > 255 * HKDF_HASHLEN)) + return -EINVAL; + + desc->tfm = hkdf->hmac_tfm; + + memcpy(prefix, "fscrypt\0", 8); + prefix[8] = context; + + for (i = 0; i < okmlen; i += HKDF_HASHLEN) { + + err = crypto_shash_init(desc); + if (err) + goto out; + + if (prev) { + err = crypto_shash_update(desc, prev, HKDF_HASHLEN); + if (err) + goto out; + } + + err = crypto_shash_update(desc, prefix, sizeof(prefix)); + if (err) + goto out; + + err = crypto_shash_update(desc, info, infolen); + if (err) + goto out; + + BUILD_BUG_ON(sizeof(counter) != 1); + if (okmlen - i < HKDF_HASHLEN) { + err = crypto_shash_finup(desc, &counter, 1, tmp); + if (err) + goto out; + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + err = crypto_shash_finup(desc, &counter, 1, &okm[i]); + if (err) + goto out; + } + counter++; + prev = &okm[i]; + } + err = 0; +out: + if (unlikely(err)) + memzero_explicit(okm, okmlen); /* so caller doesn't need to */ + shash_desc_zero(desc); + return err; +} + +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) +{ + crypto_free_shash(hkdf->hmac_tfm); +} diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index c1d6715d88e9..bb3b7fcfdd48 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -39,9 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { - fscrypt_warn(inode->i_sb, - "inconsistent encryption contexts: %lu/%lu", - d_inode(dir)->i_ino, inode->i_ino); + fscrypt_warn(inode, + "Inconsistent encryption context (parent directory: %lu)", + d_inode(dir)->i_ino); err = -EPERM; } dput(dir); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c deleted file mode 100644 index 207ebed918c1..000000000000 --- a/fs/crypto/keyinfo.c +++ /dev/null @@ -1,611 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * key management facility for FS encryption support. - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions. - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ - -#include <keys/user-type.h> -#include <linux/hashtable.h> -#include <linux/scatterlist.h> -#include <crypto/aes.h> -#include <crypto/algapi.h> -#include <crypto/sha.h> -#include <crypto/skcipher.h> -#include "fscrypt_private.h" - -static struct crypto_shash *essiv_hash_tfm; - -/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */ -static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */ -static DEFINE_SPINLOCK(fscrypt_master_keys_lock); - -/* - * Key derivation function. This generates the derived key by encrypting the - * master key with AES-128-ECB using the inode's nonce as the AES key. - * - * The master key must be at least as long as the derived key. If the master - * key is longer, then only the first 'derived_keysize' bytes are used. 
- */ -static int derive_key_aes(const u8 *master_key, - const struct fscrypt_context *ctx, - u8 *derived_key, unsigned int derived_keysize) -{ - int res = 0; - struct skcipher_request *req = NULL; - DECLARE_CRYPTO_WAIT(wait); - struct scatterlist src_sg, dst_sg; - struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce)); - if (res < 0) - goto out; - - sg_init_one(&src_sg, master_key, derived_keysize); - sg_init_one(&dst_sg, derived_key, derived_keysize); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, - NULL); - res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); -out: - skcipher_request_free(req); - crypto_free_skcipher(tfm); - return res; -} - -/* - * Search the current task's subscribed keyrings for a "logon" key with - * description prefix:descriptor, and if found acquire a read lock on it and - * return a pointer to its validated payload in *payload_ret. - */ -static struct key * -find_and_lock_process_key(const char *prefix, - const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], - unsigned int min_keysize, - const struct fscrypt_key **payload_ret) -{ - char *description; - struct key *key; - const struct user_key_payload *ukp; - const struct fscrypt_key *payload; - - description = kasprintf(GFP_NOFS, "%s%*phN", prefix, - FS_KEY_DESCRIPTOR_SIZE, descriptor); - if (!description) - return ERR_PTR(-ENOMEM); - - key = request_key(&key_type_logon, description, NULL); - kfree(description); - if (IS_ERR(key)) - return key; - - down_read(&key->sem); - ukp = user_key_payload_locked(key); - - if (!ukp) /* was the key revoked before we acquired its semaphore? 
*/ - goto invalid; - - payload = (const struct fscrypt_key *)ukp->data; - - if (ukp->datalen != sizeof(struct fscrypt_key) || - payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) { - fscrypt_warn(NULL, - "key with description '%s' has invalid payload", - key->description); - goto invalid; - } - - if (payload->size < min_keysize) { - fscrypt_warn(NULL, - "key with description '%s' is too short (got %u bytes, need %u+ bytes)", - key->description, payload->size, min_keysize); - goto invalid; - } - - *payload_ret = payload; - return key; - -invalid: - up_read(&key->sem); - key_put(key); - return ERR_PTR(-ENOKEY); -} - -static struct fscrypt_mode available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { - .friendly_name = "AES-256-XTS", - .cipher_str = "xts(aes)", - .keysize = 64, - .ivsize = 16, - }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { - .friendly_name = "AES-256-CTS-CBC", - .cipher_str = "cts(cbc(aes))", - .keysize = 32, - .ivsize = 16, - }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { - .friendly_name = "AES-128-CBC", - .cipher_str = "cbc(aes)", - .keysize = 16, - .ivsize = 16, - .needs_essiv = true, - }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { - .friendly_name = "AES-128-CTS-CBC", - .cipher_str = "cts(cbc(aes))", - .keysize = 16, - .ivsize = 16, - }, - [FS_ENCRYPTION_MODE_ADIANTUM] = { - .friendly_name = "Adiantum", - .cipher_str = "adiantum(xchacha12,aes)", - .keysize = 32, - .ivsize = 32, - }, -}; - -static struct fscrypt_mode * -select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode) -{ - if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { - fscrypt_warn(inode->i_sb, - "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", - inode->i_ino, ci->ci_data_mode, - ci->ci_filename_mode); - return ERR_PTR(-EINVAL); - } - - if (S_ISREG(inode->i_mode)) - return &available_modes[ci->ci_data_mode]; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - return &available_modes[ci->ci_filename_mode]; - - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", - inode->i_ino, (inode->i_mode & S_IFMT)); - return ERR_PTR(-EINVAL); -} - -/* Find the master key, then derive the inode's actual encryption key */ -static int find_and_derive_key(const struct inode *inode, - const struct fscrypt_context *ctx, - u8 *derived_key, const struct fscrypt_mode *mode) -{ - struct key *key; - const struct fscrypt_key *payload; - int err; - - key = find_and_lock_process_key(FS_KEY_DESC_PREFIX, - ctx->master_key_descriptor, - mode->keysize, &payload); - if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { - key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix, - ctx->master_key_descriptor, - mode->keysize, &payload); - } - if (IS_ERR(key)) - return PTR_ERR(key); - - if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) { - if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) { - fscrypt_warn(inode->i_sb, - "direct key mode not allowed with %s", - mode->friendly_name); - err = -EINVAL; - } else if (ctx->contents_encryption_mode != - ctx->filenames_encryption_mode) { - fscrypt_warn(inode->i_sb, - "direct key mode not allowed with different contents and filenames modes"); - err = -EINVAL; - } else { - memcpy(derived_key, payload->raw, mode->keysize); - err = 0; - } - } else { - err = derive_key_aes(payload->raw, ctx, derived_key, - mode->keysize); - } - up_read(&key->sem); - key_put(key); - return err; -} - -/* Allocate and key a symmetric cipher object for the 
given encryption mode */ -static struct crypto_skcipher * -allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode) -{ - struct crypto_skcipher *tfm; - int err; - - tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); - if (IS_ERR(tfm)) { - fscrypt_warn(inode->i_sb, - "error allocating '%s' transform for inode %lu: %ld", - mode->cipher_str, inode->i_ino, PTR_ERR(tfm)); - return tfm; - } - if (unlikely(!mode->logged_impl_name)) { - /* - * fscrypt performance can vary greatly depending on which - * crypto algorithm implementation is used. Help people debug - * performance problems by logging the ->cra_driver_name the - * first time a mode is used. Note that multiple threads can - * race here, but it doesn't really matter. - */ - mode->logged_impl_name = true; - pr_info("fscrypt: %s using implementation \"%s\"\n", - mode->friendly_name, - crypto_skcipher_alg(tfm)->base.cra_driver_name); - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); - if (err) - goto err_free_tfm; - - return tfm; - -err_free_tfm: - crypto_free_skcipher(tfm); - return ERR_PTR(err); -} - -/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */ -struct fscrypt_master_key { - struct hlist_node mk_node; - refcount_t mk_refcount; - const struct fscrypt_mode *mk_mode; - struct crypto_skcipher *mk_ctfm; - u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE]; - u8 mk_raw[FS_MAX_KEY_SIZE]; -}; - -static void free_master_key(struct fscrypt_master_key *mk) -{ - if (mk) { - crypto_free_skcipher(mk->mk_ctfm); - kzfree(mk); - } -} - -static void put_master_key(struct fscrypt_master_key *mk) -{ - if (!refcount_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock)) - return; - hash_del(&mk->mk_node); - spin_unlock(&fscrypt_master_keys_lock); - - free_master_key(mk); -} - -/* - * Find/insert the given master key into the fscrypt_master_keys table. If - * found, it is returned with elevated refcount, and 'to_insert' is freed if - * non-NULL. If not found, 'to_insert' is inserted and returned if it's - * non-NULL; otherwise NULL is returned. - */ -static struct fscrypt_master_key * -find_or_insert_master_key(struct fscrypt_master_key *to_insert, - const u8 *raw_key, const struct fscrypt_mode *mode, - const struct fscrypt_info *ci) -{ - unsigned long hash_key; - struct fscrypt_master_key *mk; - - /* - * Careful: to avoid potentially leaking secret key bytes via timing - * information, we must key the hash table by descriptor rather than by - * raw key, and use crypto_memneq() when comparing raw keys. 
- */ - - BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE); - memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key)); - - spin_lock(&fscrypt_master_keys_lock); - hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) { - if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor, - FS_KEY_DESCRIPTOR_SIZE) != 0) - continue; - if (mode != mk->mk_mode) - continue; - if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize)) - continue; - /* using existing tfm with same (descriptor, mode, raw_key) */ - refcount_inc(&mk->mk_refcount); - spin_unlock(&fscrypt_master_keys_lock); - free_master_key(to_insert); - return mk; - } - if (to_insert) - hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key); - spin_unlock(&fscrypt_master_keys_lock); - return to_insert; -} - -/* Prepare to encrypt directly using the master key in the given mode */ -static struct fscrypt_master_key * -fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode, - const u8 *raw_key, const struct inode *inode) -{ - struct fscrypt_master_key *mk; - int err; - - /* Is there already a tfm for this key? */ - mk = find_or_insert_master_key(NULL, raw_key, mode, ci); - if (mk) - return mk; - - /* Nope, allocate one. */ - mk = kzalloc(sizeof(*mk), GFP_NOFS); - if (!mk) - return ERR_PTR(-ENOMEM); - refcount_set(&mk->mk_refcount, 1); - mk->mk_mode = mode; - mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode); - if (IS_ERR(mk->mk_ctfm)) { - err = PTR_ERR(mk->mk_ctfm); - mk->mk_ctfm = NULL; - goto err_free_mk; - } - memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - memcpy(mk->mk_raw, raw_key, mode->keysize); - - return find_or_insert_master_key(mk, raw_key, mode, ci); - -err_free_mk: - free_master_key(mk); - return ERR_PTR(err); -} - -static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) -{ - struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); - - /* init hash transform on demand */ - if (unlikely(!tfm)) { - struct crypto_shash *prev_tfm; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - fscrypt_warn(NULL, - "error allocating SHA-256 transform: %ld", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); - if (prev_tfm) { - crypto_free_shash(tfm); - tfm = prev_tfm; - } - } - - { - SHASH_DESC_ON_STACK(desc, tfm); - desc->tfm = tfm; - - return crypto_shash_digest(desc, key, keysize, salt); - } -} - -static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, - int keysize) -{ - int err; - struct crypto_cipher *essiv_tfm; - u8 salt[SHA256_DIGEST_SIZE]; - - essiv_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(essiv_tfm)) - return PTR_ERR(essiv_tfm); - - ci->ci_essiv_tfm = essiv_tfm; - - err = derive_essiv_salt(raw_key, keysize, salt); - if (err) - goto out; - - /* - * Using SHA256 to derive the salt/key will result in AES-256 being - * used for IV generation. File contents encryption will still use the - * configured keysize (AES-128) nevertheless. - */ - err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); - if (err) - goto out; - -out: - memzero_explicit(salt, sizeof(salt)); - return err; -} - -void __exit fscrypt_essiv_cleanup(void) -{ - crypto_free_shash(essiv_hash_tfm); -} - -/* - * Given the encryption mode and key (normally the derived key, but for - * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's - * symmetric cipher transform object(s). 
- */ -static int setup_crypto_transform(struct fscrypt_info *ci, - struct fscrypt_mode *mode, - const u8 *raw_key, const struct inode *inode) -{ - struct fscrypt_master_key *mk; - struct crypto_skcipher *ctfm; - int err; - - if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) { - mk = fscrypt_get_master_key(ci, mode, raw_key, inode); - if (IS_ERR(mk)) - return PTR_ERR(mk); - ctfm = mk->mk_ctfm; - } else { - mk = NULL; - ctfm = allocate_skcipher_for_mode(mode, raw_key, inode); - if (IS_ERR(ctfm)) - return PTR_ERR(ctfm); - } - ci->ci_master_key = mk; - ci->ci_ctfm = ctfm; - - if (mode->needs_essiv) { - /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ - WARN_ON(mode->ivsize != AES_BLOCK_SIZE); - WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY); - - err = init_essiv_generator(ci, raw_key, mode->keysize); - if (err) { - fscrypt_warn(inode->i_sb, - "error initializing ESSIV generator for inode %lu: %d", - inode->i_ino, err); - return err; - } - } - return 0; -} - -static void put_crypt_info(struct fscrypt_info *ci) -{ - if (!ci) - return; - - if (ci->ci_master_key) { - put_master_key(ci->ci_master_key); - } else { - crypto_free_skcipher(ci->ci_ctfm); - crypto_free_cipher(ci->ci_essiv_tfm); - } - kmem_cache_free(fscrypt_info_cachep, ci); -} - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *crypt_info; - struct fscrypt_context ctx; - struct fscrypt_mode *mode; - u8 *raw_key = NULL; - int res; - - if (fscrypt_has_encryption_key(inode)) - return 0; - - res = fscrypt_initialize(inode->i_sb->s_cop->flags); - if (res) - return res; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) - return res; - /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - } else if (res != sizeof(ctx)) { - return -EINVAL; - } - - if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - - if (ctx.flags & ~FS_POLICY_FLAGS_VALID) - return -EINVAL; - - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); - - mode = select_encryption_mode(crypt_info, inode); - if (IS_ERR(mode)) { - res = PTR_ERR(mode); - goto out; - } - WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); - crypt_info->ci_mode = mode; - - /* - * This cannot be a stack buffer because it may be passed to the - * scatterlist crypto API as part of key derivation. 
- */ - res = -ENOMEM; - raw_key = kmalloc(mode->keysize, GFP_NOFS); - if (!raw_key) - goto out; - - res = find_and_derive_key(inode, &ctx, raw_key, mode); - if (res) - goto out; - - res = setup_crypto_transform(crypt_info, mode, raw_key, inode); - if (res) - goto out; - - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) - crypt_info = NULL; -out: - if (res == -ENOKEY) - res = 0; - put_crypt_info(crypt_info); - kzfree(raw_key); - return res; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); - -/** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data - * - * Free the inode's fscrypt_info. Filesystems must call this when the inode is - * being evicted. An RCU grace period need not have elapsed yet. - */ -void fscrypt_put_encryption_info(struct inode *inode) -{ - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; -} -EXPORT_SYMBOL(fscrypt_put_encryption_info); - -/** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay - * - * Free the inode's cached decrypted symlink target, if any. Filesystems must - * call this after an RCU grace period, just before they free the inode. - */ -void fscrypt_free_inode(struct inode *inode) -{ - if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { - kfree(inode->i_link); - inode->i_link = NULL; - } -} -EXPORT_SYMBOL(fscrypt_free_inode); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c new file mode 100644 index 000000000000..c34fa7c61b43 --- /dev/null +++ b/fs/crypto/keyring.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Filesystem-level keyring for fscrypt + * + * Copyright 2019 Google LLC + */ + +/* + * This file implements management of fscrypt master keys in the + * filesystem-level keyring, including the ioctls: + * + * - FS_IOC_ADD_ENCRYPTION_KEY + * - FS_IOC_REMOVE_ENCRYPTION_KEY + * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS + * - FS_IOC_GET_ENCRYPTION_KEY_STATUS + * + * See the "User API" section of Documentation/filesystems/fscrypt.rst for more + * information about these ioctls. 
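+ *
+ * Illustrative userspace usage of FS_IOC_ADD_ENCRYPTION_KEY for a v2
+ * policy key, given a raw_size-byte key in raw_key (error handling
+ * omitted; 'fd' may be any open file on the target filesystem):
+ *
+ *	struct fscrypt_add_key_arg *arg;
+ *
+ *	arg = calloc(1, sizeof(*arg) + raw_size);
+ *	arg->key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ *	arg->raw_size = raw_size;
+ *	memcpy(arg->raw, raw_key, raw_size);
+ *	ioctl(fd, FS_IOC_ADD_ENCRYPTION_KEY, arg);
+ *
+ * On success the kernel fills in arg->key_spec.u.identifier, which can
+ * then be used in a v2 encryption policy.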
+ */ + +#include <crypto/skcipher.h> +#include <linux/key-type.h> +#include <linux/seq_file.h> + +#include "fscrypt_private.h" + +static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) +{ + fscrypt_destroy_hkdf(&secret->hkdf); + memzero_explicit(secret, sizeof(*secret)); +} + +static void move_master_key_secret(struct fscrypt_master_key_secret *dst, + struct fscrypt_master_key_secret *src) +{ + memcpy(dst, src, sizeof(*dst)); + memzero_explicit(src, sizeof(*src)); +} + +static void free_master_key(struct fscrypt_master_key *mk) +{ + size_t i; + + wipe_master_key_secret(&mk->mk_secret); + + for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++) + crypto_free_skcipher(mk->mk_mode_keys[i]); + + key_put(mk->mk_users); + kzfree(mk); +} + +static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} + +static int fscrypt_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + key->payload.data[0] = (struct fscrypt_master_key *)prep->data; + return 0; +} + +static void fscrypt_key_destroy(struct key *key) +{ + free_master_key(key->payload.data[0]); +} + +static void fscrypt_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); + + if (key_is_positive(key)) { + const struct fscrypt_master_key *mk = key->payload.data[0]; + + if (!is_master_key_secret_present(&mk->mk_secret)) + seq_puts(m, ": secret removed"); + } +} + +/* + * Type of key in ->s_master_keys. Each key of this type represents a master + * key which has been added to the filesystem. Its payload is a + * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents + * users from adding keys of this type via the keyrings syscalls rather than via + * the intended method of FS_IOC_ADD_ENCRYPTION_KEY. + */ +static struct key_type key_type_fscrypt = { + .name = "._fscrypt", + .instantiate = fscrypt_key_instantiate, + .destroy = fscrypt_key_destroy, + .describe = fscrypt_key_describe, +}; + +static int fscrypt_user_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + /* + * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for + * each key, regardless of the exact key size. The amount of memory + * actually used is greater than the size of the raw key anyway. + */ + return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE); +} + +static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * Type of key in ->mk_users. Each key of this type represents a particular + * user who has added a particular master key. + * + * Note that the name of this key type really should be something like + * ".fscrypt-user" instead of simply ".fscrypt". But the shorter name is chosen + * mainly for simplicity of presentation in /proc/keys when read by a non-root + * user. And it is expected to be rare that a key is actually added by multiple + * users, since users should keep their encryption keys confidential. 
+ */ +static struct key_type key_type_fscrypt_user = { + .name = ".fscrypt", + .instantiate = fscrypt_user_key_instantiate, + .describe = fscrypt_user_key_describe, +}; + +/* Search ->s_master_keys or ->mk_users */ +static struct key *search_fscrypt_keyring(struct key *keyring, + struct key_type *type, + const char *description) +{ + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + key_ref_t keyref = make_key_ref(keyring, true /* possessed */); + + keyref = keyring_search(keyref, type, description, false); + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); +} + +#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ + (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id)) + +#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) + +#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ + (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ + CONST_STRLEN("-users") + 1) + +#define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ + (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) + +static void format_fs_keyring_description( + char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE], + const struct super_block *sb) +{ + sprintf(description, "fscrypt-%s", sb->s_id); +} + +static void format_mk_description( + char description[FSCRYPT_MK_DESCRIPTION_SIZE], + const struct fscrypt_key_specifier *mk_spec) +{ + sprintf(description, "%*phN", + master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); +} + +static void format_mk_users_keyring_description( + char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], + const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ + sprintf(description, "fscrypt-%*phN-users", + FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); +} + +static void format_mk_user_description( + char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE], + const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ + + sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE, + mk_identifier, __kuid_val(current_fsuid())); +} + +/* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ +static int allocate_filesystem_keyring(struct super_block *sb) +{ + char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; + struct key *keyring; + + if (sb->s_master_keys) + return 0; + + format_fs_keyring_description(description, sb); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + /* Pairs with READ_ONCE() in fscrypt_find_master_key() */ + smp_store_release(&sb->s_master_keys, keyring); + return 0; +} + +void fscrypt_sb_free(struct super_block *sb) +{ + key_put(sb->s_master_keys); + sb->s_master_keys = NULL; +} + +/* + * Find the specified master key in ->s_master_keys. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +struct key *fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec) +{ + struct key *keyring; + char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + + /* pairs with smp_store_release() in allocate_filesystem_keyring() */ + keyring = READ_ONCE(sb->s_master_keys); + if (keyring == NULL) + return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. 
*/ + + format_mk_description(description, mk_spec); + return search_fscrypt_keyring(keyring, &key_type_fscrypt, description); +} + +static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) +{ + char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE]; + struct key *keyring; + + format_mk_users_keyring_description(description, + mk->mk_spec.u.identifier); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + mk->mk_users = keyring; + return 0; +} + +/* + * Find the current user's "key" in the master key's ->mk_users. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +static struct key *find_master_key_user(struct fscrypt_master_key *mk) +{ + char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user, + description); +} + +/* + * Give the current user a "key" in ->mk_users. This charges the user's quota + * and marks the master key as added by the current user, so that it cannot be + * removed by another user with the key. Either the master key's key->sem must + * be held for write, or the master key must be still undergoing initialization. + */ +static int add_master_key_user(struct fscrypt_master_key *mk) +{ + char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + struct key *mk_user; + int err; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + mk_user = key_alloc(&key_type_fscrypt_user, description, + current_fsuid(), current_gid(), current_cred(), + KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + + err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); + key_put(mk_user); + return err; +} + +/* + * Remove the current user's "key" from ->mk_users. + * The master key's key->sem must be held for write. + * + * Returns 0 if removed, -ENOKEY if not found, or another -errno code. + */ +static int remove_master_key_user(struct fscrypt_master_key *mk) +{ + struct key *mk_user; + int err; + + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + err = key_unlink(mk->mk_users, mk_user); + key_put(mk_user); + return err; +} + +/* + * Allocate a new fscrypt_master_key which contains the given secret, set it as + * the payload of a new 'struct key' of type fscrypt, and link the 'struct key' + * into the given keyring. Synchronized by fscrypt_add_key_mutex. 
+ */ +static int add_new_master_key(struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec, + struct key *keyring) +{ + struct fscrypt_master_key *mk; + char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + struct key *key; + int err; + + mk = kzalloc(sizeof(*mk), GFP_KERNEL); + if (!mk) + return -ENOMEM; + + mk->mk_spec = *mk_spec; + + move_master_key_secret(&mk->mk_secret, secret); + init_rwsem(&mk->mk_secret_sem); + + refcount_set(&mk->mk_refcount, 1); /* secret is present */ + INIT_LIST_HEAD(&mk->mk_decrypted_inodes); + spin_lock_init(&mk->mk_decrypted_inodes_lock); + + if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = allocate_master_key_users_keyring(mk); + if (err) + goto out_free_mk; + err = add_master_key_user(mk); + if (err) + goto out_free_mk; + } + + /* + * Note that we don't charge this key to anyone's quota, since when + * ->mk_users is in use those keys are charged instead, and otherwise + * (when ->mk_users isn't in use) only root can add these keys. + */ + format_mk_description(description, mk_spec); + key = key_alloc(&key_type_fscrypt, description, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out_free_mk; + } + err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); + key_put(key); + if (err) + goto out_free_mk; + + return 0; + +out_free_mk: + free_master_key(mk); + return err; +} + +#define KEY_DEAD 1 + +static int add_existing_master_key(struct fscrypt_master_key *mk, + struct fscrypt_master_key_secret *secret) +{ + struct key *mk_user; + bool rekey; + int err; + + /* + * If the current user is already in ->mk_users, then there's nothing to + * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + */ + if (mk->mk_users) { + mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + key_put(mk_user); + return 0; + } + } + + /* If we'll be re-adding ->mk_secret, try to take the reference. */ + rekey = !is_master_key_secret_present(&mk->mk_secret); + if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) + return KEY_DEAD; + + /* Add the current user to ->mk_users, if applicable. */ + if (mk->mk_users) { + err = add_master_key_user(mk); + if (err) { + if (rekey && refcount_dec_and_test(&mk->mk_refcount)) + return KEY_DEAD; + return err; + } + } + + /* Re-add the secret if needed. */ + if (rekey) { + down_write(&mk->mk_secret_sem); + move_master_key_secret(&mk->mk_secret, secret); + up_write(&mk->mk_secret_sem); + } + return 0; +} + +static int add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) +{ + static DEFINE_MUTEX(fscrypt_add_key_mutex); + struct key *key; + int err; + + mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ +retry: + key = fscrypt_find_master_key(sb, mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + if (err != -ENOKEY) + goto out_unlock; + /* Didn't find the key in ->s_master_keys. Add it. */ + err = allocate_filesystem_keyring(sb); + if (err) + goto out_unlock; + err = add_new_master_key(secret, mk_spec, sb->s_master_keys); + } else { + /* + * Found the key in ->s_master_keys. Re-add the secret if + * needed, and add the user to ->mk_users if needed. 
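+		 * If instead the key is concurrently being invalidated
+		 * (KEY_DEAD), drop it and retry from the top; the retry
+		 * will then miss in ->s_master_keys and take the
+		 * add_new_master_key() path.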
+ */ + down_write(&key->sem); + err = add_existing_master_key(key->payload.data[0], secret); + up_write(&key->sem); + if (err == KEY_DEAD) { + /* Key being removed or needs to be removed */ + key_invalidate(key); + key_put(key); + goto retry; + } + key_put(key); + } +out_unlock: + mutex_unlock(&fscrypt_add_key_mutex); + return err; +} + +/* + * Add a master encryption key to the filesystem, causing all files which were + * encrypted with it to appear "unlocked" (decrypted) when accessed. + * + * When adding a key for use by v1 encryption policies, this ioctl is + * privileged, and userspace must provide the 'key_descriptor'. + * + * When adding a key for use by v2+ encryption policies, this ioctl is + * unprivileged. This is needed, in general, to allow non-root users to use + * encryption without encountering the visibility problems of process-subscribed + * keyrings and the inability to properly remove keys. This works by having + * each key identified by its cryptographically secure hash --- the + * 'key_identifier'. The cryptographic hash ensures that a malicious user + * cannot add the wrong key for a given identifier. Furthermore, each added key + * is charged to the appropriate user's quota for the keyrings service, which + * prevents a malicious user from adding too many keys. Finally, we forbid a + * user from removing a key while other users have added it too, which prevents + * a user who knows another user's key from causing a denial-of-service by + * removing it at an inopportune time. (We tolerate that a user who knows a key + * can prevent other users from removing it.) + * + * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of + * Documentation/filesystems/fscrypt.rst. + */ +int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct fscrypt_add_key_arg __user *uarg = _uarg; + struct fscrypt_add_key_arg arg; + struct fscrypt_master_key_secret secret; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || + arg.raw_size > FSCRYPT_MAX_KEY_SIZE) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + memset(&secret, 0, sizeof(secret)); + secret.size = arg.raw_size; + err = -EFAULT; + if (copy_from_user(secret.raw, uarg->raw, secret.size)) + goto out_wipe_secret; + + switch (arg.key_spec.type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + /* + * Only root can add keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash --- since + * otherwise a malicious user could add the wrong key. + */ + err = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out_wipe_secret; + break; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out_wipe_secret; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret.raw, secret.size); + + /* Calculate the key identifier and return it to userspace. 
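+		 * (It is simply an HKDF expansion of the key itself, using
+		 * the HKDF_CONTEXT_KEY_IDENTIFIER context byte and an empty
+		 * info string, so anyone who knows the key can recompute
+		 * the identifier.)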
*/ + err = fscrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + goto out_wipe_secret; + err = -EFAULT; + if (copy_to_user(uarg->key_spec.u.identifier, + arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out_wipe_secret; + } + + err = add_master_key(sb, &secret, &arg.key_spec); +out_wipe_secret: + wipe_master_key_secret(&secret); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); + +/* + * Verify that the current user has added a master key with the given identifier + * (returns -ENOKEY if not). This is needed to prevent a user from encrypting + * their files using some other user's key which they don't actually know. + * Cryptographically this isn't much of a problem, but the semantics of this + * would be a bit weird, so it's best to just forbid it. + * + * The system administrator (CAP_FOWNER) can override this, which should be + * enough for any use cases where encryption policies are being set using keys + * that were chosen ahead of time but aren't available at the moment. + * + * Note that the key may have already been removed by the time this returns, but + * that's okay; we just care whether the key was there at some point. + * + * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code + */ +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ + struct fscrypt_key_specifier mk_spec; + struct key *key, *mk_user; + struct fscrypt_master_key *mk; + int err; + + mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); + + key = fscrypt_find_master_key(sb, &mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out; + } + mk = key->payload.data[0]; + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) { + err = PTR_ERR(mk_user); + } else { + key_put(mk_user); + err = 0; + } + key_put(key); +out: + if (err == -ENOKEY && capable(CAP_FOWNER)) + err = 0; + return err; +} + +/* + * Try to evict the inode's dentries from the dentry cache. If the inode is a + * directory, then it can have at most one dentry; however, that dentry may be + * pinned by child dentries, so first try to evict the children too.
+ */ +static void shrink_dcache_inode(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + dentry = d_find_any_alias(inode); + if (dentry) { + shrink_dcache_parent(dentry); + dput(dentry); + } + } + d_prune_aliases(inode); +} + +static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) +{ + struct fscrypt_info *ci; + struct inode *inode; + struct inode *toput_inode = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { + inode = ci->ci_inode; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&mk->mk_decrypted_inodes_lock); + + shrink_dcache_inode(inode); + iput(toput_inode); + toput_inode = inode; + + spin_lock(&mk->mk_decrypted_inodes_lock); + } + + spin_unlock(&mk->mk_decrypted_inodes_lock); + iput(toput_inode); +} + +static int check_for_busy_inodes(struct super_block *sb, + struct fscrypt_master_key *mk) +{ + struct list_head *pos; + size_t busy_count = 0; + unsigned long ino; + struct dentry *dentry; + char _path[256]; + char *path = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each(pos, &mk->mk_decrypted_inodes) + busy_count++; + + if (busy_count == 0) { + spin_unlock(&mk->mk_decrypted_inodes_lock); + return 0; + } + + { + /* select an example file to show for debugging purposes */ + struct inode *inode = + list_first_entry(&mk->mk_decrypted_inodes, + struct fscrypt_info, + ci_master_key_link)->ci_inode; + ino = inode->i_ino; + dentry = d_find_alias(inode); + } + spin_unlock(&mk->mk_decrypted_inodes_lock); + + if (dentry) { + path = dentry_path(dentry, _path, sizeof(_path)); + dput(dentry); + } + if (IS_ERR_OR_NULL(path)) + path = "(unknown)"; + + fscrypt_warn(NULL, + "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)", + sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, + ino, path); + return -EBUSY; +} + +static int try_to_lock_encrypted_files(struct super_block *sb, + struct fscrypt_master_key *mk) +{ + int err1; + int err2; + + /* + * An inode can't be evicted while it is dirty or has dirty pages. + * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. + * + * Just do it the easy way: call sync_filesystem(). It's overkill, but + * it works, and it's more important to minimize the amount of caches we + * drop than the amount of data we sync. Also, unprivileged users can + * already call sync_filesystem() via sys_syncfs() or sys_sync(). + */ + down_read(&sb->s_umount); + err1 = sync_filesystem(sb); + up_read(&sb->s_umount); + /* If a sync error occurs, still try to evict as much as possible. */ + + /* + * Inodes are pinned by their dentries, so we have to evict their + * dentries. shrink_dcache_sb() would suffice, but would be overkill + * and inappropriate for use by unprivileged users. So instead go + * through the inodes' alias lists and try to evict each dentry. + */ + evict_dentries_for_decrypted_inodes(mk); + + /* + * evict_dentries_for_decrypted_inodes() already iput() each inode in + * the list; any inodes for which that dropped the last reference will + * have been evicted due to fscrypt_drop_inode() detecting the key + * removal and telling the VFS to evict the inode. So to finish, we + * just need to check whether any inodes couldn't be evicted. 
+ */ + err2 = check_for_busy_inodes(sb, mk); + + return err1 ?: err2; +} + +/* + * Try to remove an fscrypt master encryption key. + * + * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's + * claim to the key, then removes the key itself if no other users have claims. + * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the + * key itself. + * + * To "remove the key itself", first we wipe the actual master key secret, so + * that no more inodes can be unlocked with it. Then we try to evict all cached + * inodes that had been unlocked with the key. + * + * If all inodes were evicted, then we unlink the fscrypt_master_key from the + * keyring. Otherwise it remains in the keyring in the "incompletely removed" + * state (without the actual secret key) where it tracks the list of remaining + * inodes. Userspace can execute the ioctl again later to retry eviction, or + * alternatively can re-add the secret key again. + * + * For more details, see the "Removing keys" section of + * Documentation/filesystems/fscrypt.rst. + */ +static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct fscrypt_remove_key_arg __user *uarg = _uarg; + struct fscrypt_remove_key_arg arg; + struct key *key; + struct fscrypt_master_key *mk; + u32 status_flags = 0; + int err; + bool dead; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + /* + * Only root can add and remove keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash. + */ + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* Find the key being removed. */ + key = fscrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) + return PTR_ERR(key); + mk = key->payload.data[0]; + + down_write(&key->sem); + + /* If relevant, remove current user's (or all users) claim to the key */ + if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { + if (all_users) + err = keyring_clear(mk->mk_users); + else + err = remove_master_key_user(mk); + if (err) { + up_write(&key->sem); + goto out_put_key; + } + if (mk->mk_users->keys.nr_leaves_on_tree != 0) { + /* + * Other users have still added the key too. We removed + * the current user's claim to the key, but we still + * can't remove the key itself. + */ + status_flags |= + FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; + err = 0; + up_write(&key->sem); + goto out_put_key; + } + } + + /* No user claims remaining. Go ahead and wipe the secret. */ + dead = false; + if (is_master_key_secret_present(&mk->mk_secret)) { + down_write(&mk->mk_secret_sem); + wipe_master_key_secret(&mk->mk_secret); + dead = refcount_dec_and_test(&mk->mk_refcount); + up_write(&mk->mk_secret_sem); + } + up_write(&key->sem); + if (dead) { + /* + * No inodes reference the key, and we wiped the secret, so the + * key object is free to be removed from the keyring. + */ + key_invalidate(key); + err = 0; + } else { + /* Some inodes still reference this key; try to evict them. */ + err = try_to_lock_encrypted_files(sb, mk); + if (err == -EBUSY) { + status_flags |= + FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; + err = 0; + } + } + /* + * We return 0 if we successfully did something: removed a claim to the + * key, wiped the secret, or tried locking the files again. 
Users need + * to check the informational status flags if they care whether the key + * has been fully removed including all files locked. + */ +out_put_key: + key_put(key); + if (err == 0) + err = put_user(status_flags, &uarg->removal_status_flags); + return err; +} + +int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg) +{ + return do_remove_key(filp, uarg, false); +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key); + +int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + return do_remove_key(filp, uarg, true); +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); + +/* + * Retrieve the status of an fscrypt master encryption key. + * + * We set ->status to indicate whether the key is absent, present, or + * incompletely removed. "Incompletely removed" means that the master key + * secret has been removed, but some files which had been unlocked with it are + * still in use. This field allows applications to easily determine the state + * of an encrypted directory without using a hack such as trying to open a + * regular file in it (which can confuse the "incompletely removed" state with + * absent or present). + * + * In addition, for v2 policy keys we allow applications to determine, via + * ->status_flags and ->user_count, whether the key has been added by the + * current user, by other users, or by both. Most applications should not need + * this, since ordinarily only one user should know a given key. However, if a + * secret key is shared by multiple users, applications may wish to add an + * already-present key to prevent other users from removing it. This ioctl can + * be used to check whether that really is the case before the work is done to + * add the key --- which might e.g. require prompting the user for a passphrase. + * + * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of + * Documentation/filesystems/fscrypt.rst. 
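+ *
+ * (For example, an application wanting to keep a shared key alive could
+ * call this ioctl first, and only prompt for the passphrase and call
+ * FS_IOC_ADD_ENCRYPTION_KEY if FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF is
+ * not set in ->status_flags.)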
+ */ +int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct fscrypt_get_key_status_arg arg; + struct key *key; + struct fscrypt_master_key *mk; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + arg.status_flags = 0; + arg.user_count = 0; + memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); + + key = fscrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY)) + return PTR_ERR(key); + arg.status = FSCRYPT_KEY_STATUS_ABSENT; + err = 0; + goto out; + } + mk = key->payload.data[0]; + down_read(&key->sem); + + if (!is_master_key_secret_present(&mk->mk_secret)) { + arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + err = 0; + goto out_release_key; + } + + arg.status = FSCRYPT_KEY_STATUS_PRESENT; + if (mk->mk_users) { + struct key *mk_user; + + arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; + mk_user = find_master_key_user(mk); + if (!IS_ERR(mk_user)) { + arg.status_flags |= + FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; + key_put(mk_user); + } else if (mk_user != ERR_PTR(-ENOKEY)) { + err = PTR_ERR(mk_user); + goto out_release_key; + } + } + err = 0; +out_release_key: + up_read(&key->sem); + key_put(key); +out: + if (!err && copy_to_user(uarg, &arg, sizeof(arg))) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status); + +int __init fscrypt_init_keyring(void) +{ + int err; + + err = register_key_type(&key_type_fscrypt); + if (err) + return err; + + err = register_key_type(&key_type_fscrypt_user); + if (err) + goto err_unregister_fscrypt; + + return 0; + +err_unregister_fscrypt: + unregister_key_type(&key_type_fscrypt); + return err; +} diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c new file mode 100644 index 000000000000..d71c2d6dd162 --- /dev/null +++ b/fs/crypto/keysetup.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. 
+ */ + +#include <crypto/aes.h> +#include <crypto/sha.h> +#include <crypto/skcipher.h> +#include <linux/key.h> + +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; + +static struct fscrypt_mode available_modes[] = { + [FSCRYPT_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + .ivsize = 16, + }, + [FSCRYPT_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + .ivsize = 16, + }, + [FSCRYPT_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + .ivsize = 16, + .needs_essiv = true, + }, + [FSCRYPT_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 16, + .ivsize = 16, + }, + [FSCRYPT_MODE_ADIANTUM] = { + .friendly_name = "Adiantum", + .cipher_str = "adiantum(xchacha12,aes)", + .keysize = 32, + .ivsize = 32, + }, +}; + +static struct fscrypt_mode * +select_encryption_mode(const union fscrypt_policy *policy, + const struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return &available_modes[fscrypt_policy_contents_mode(policy)]; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[fscrypt_policy_fnames_mode(policy)]; + + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); +} + +/* Create a symmetric cipher object for the given encryption mode and key */ +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode) +{ + struct crypto_skcipher *tfm; + int err; + + tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + fscrypt_warn(inode, + "Missing crypto API support for %s (API name: \"%s\")", + mode->friendly_name, mode->cipher_str); + return ERR_PTR(-ENOPKG); + } + fscrypt_err(inode, "Error allocating '%s' transform: %ld", + mode->cipher_str, PTR_ERR(tfm)); + return tfm; + } + if (unlikely(!mode->logged_impl_name)) { + /* + * fscrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. 
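+		 * (For example, on x86 with the AES-NI driver selected,
+		 * this might log:
+		 *
+		 *	fscrypt: AES-256-XTS using implementation "xts-aes-aesni"
+		 *
+		 * The driver name shown is whatever the crypto API picked.)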
+ */ + mode->logged_impl_name = true; + pr_info("fscrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(tfm)->base.cra_driver_name); + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); + if (err) + goto err_free_tfm; + + return tfm; + +err_free_tfm: + crypto_free_skcipher(tfm); + return ERR_PTR(err); +} + +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + fscrypt_warn(NULL, + "Missing crypto API support for SHA-256"); + return -ENOPKG; + } + fscrypt_err(NULL, + "Error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE)) + return -EINVAL; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
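+	 *
+	 * (Schematically, the IV for each data block then becomes
+	 *
+	 *	IV = AES-256-Encrypt(SHA-256(derived_key), plain IV)
+	 *
+	 * where the plain IV encodes the logical block number, so the
+	 * per-block IVs are unpredictable without knowledge of the key.)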
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +/* Given the per-file key, set up the file's crypto transform object(s) */ +int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) +{ + struct fscrypt_mode *mode = ci->ci_mode; + struct crypto_skcipher *ctfm; + int err; + + ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode); + if (IS_ERR(ctfm)) + return PTR_ERR(ctfm); + + ci->ci_ctfm = ctfm; + + if (mode->needs_essiv) { + err = init_essiv_generator(ci, derived_key, mode->keysize); + if (err) { + fscrypt_warn(ci->ci_inode, + "Error initializing ESSIV generator: %d", + err); + return err; + } + } + return 0; +} + +static int setup_per_mode_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + struct fscrypt_mode *mode = ci->ci_mode; + u8 mode_num = mode - available_modes; + struct crypto_skcipher *tfm, *prev_tfm; + u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; + int err; + + if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys))) + return -EINVAL; + + /* pairs with cmpxchg() below */ + tfm = READ_ONCE(mk->mk_mode_keys[mode_num]); + if (likely(tfm != NULL)) + goto done; + + BUILD_BUG_ON(sizeof(mode_num) != 1); + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_MODE_KEY, + &mode_num, sizeof(mode_num), + mode_key, mode->keysize); + if (err) + return err; + tfm = fscrypt_allocate_skcipher(mode, mode_key, ci->ci_inode); + memzero_explicit(mode_key, mode->keysize); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + /* pairs with READ_ONCE() above */ + prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm); + if (prev_tfm != NULL) { + crypto_free_skcipher(tfm); + tfm = prev_tfm; + } +done: + ci->ci_ctfm = tfm; + return 0; +} + +static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; + int err; + + if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { + /* + * DIRECT_KEY: instead of deriving per-file keys, the per-file + * nonce will be included in all the IVs. But unlike v1 + * policies, for v2 policies in this case we don't encrypt with + * the master key directly but rather derive a per-mode key. + * This ensures that the master key is consistently used only + * for HKDF, avoiding key reuse issues. + */ + if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) { + fscrypt_warn(ci->ci_inode, + "Direct key flag not allowed with %s", + ci->ci_mode->friendly_name); + return -EINVAL; + } + return setup_per_mode_key(ci, mk); + } + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_KEY, + ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); + if (err) + return err; + + err = fscrypt_set_derived_key(ci, derived_key); + memzero_explicit(derived_key, ci->ci_mode->keysize); + return err; +} + +/* + * Find the master key, then set up the inode's actual encryption key. + * + * If the master key is found in the filesystem-level keyring, then the + * corresponding 'struct key' is returned in *master_key_ret with + * ->mk_secret_sem read-locked. This is needed to ensure that only one task + * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race + * to create an fscrypt_info for the same inode), and to synchronize the master + * key being removed with a new inode starting to use it. 
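+ *
+ * (The caller must up_read(&mk->mk_secret_sem) and key_put() the returned
+ * key once the new fscrypt_info has either been linked into
+ * ->mk_decrypted_inodes or discarded; fscrypt_get_encryption_info()
+ * below does exactly that.)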
+ */ +static int setup_file_encryption_key(struct fscrypt_info *ci, + struct key **master_key_ret) +{ + struct key *key; + struct fscrypt_master_key *mk = NULL; + struct fscrypt_key_specifier mk_spec; + int err; + + switch (ci->ci_policy.version) { + case FSCRYPT_POLICY_V1: + mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(mk_spec.u.descriptor, + ci->ci_policy.v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_POLICY_V2: + mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, + ci->ci_policy.v2.master_key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + return -EINVAL; + } + + key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY) || + ci->ci_policy.version != FSCRYPT_POLICY_V1) + return PTR_ERR(key); + + /* + * As a legacy fallback for v1 policies, search for the key in + * the current task's subscribed keyrings too. Don't move this + * to before the search of ->s_master_keys, since users + * shouldn't be able to override filesystem-level keys. + */ + return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); + } + + mk = key->payload.data[0]; + down_read(&mk->mk_secret_sem); + + /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ + if (!is_master_key_secret_present(&mk->mk_secret)) { + err = -ENOKEY; + goto out_release_key; + } + + /* + * Require that the master key be at least as long as the derived key. + * Otherwise, the derived key cannot possibly contain as much entropy as + * that required by the encryption mode it will be used for. For v1 + * policies it's also required for the KDF to work at all. + */ + if (mk->mk_secret.size < ci->ci_mode->keysize) { + fscrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk_spec), + master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, + mk->mk_secret.size, ci->ci_mode->keysize); + err = -ENOKEY; + goto out_release_key; + } + + switch (ci->ci_policy.version) { + case FSCRYPT_POLICY_V1: + err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); + break; + case FSCRYPT_POLICY_V2: + err = fscrypt_setup_v2_file_key(ci, mk); + break; + default: + WARN_ON(1); + err = -EINVAL; + break; + } + if (err) + goto out_release_key; + + *master_key_ret = key; + return 0; + +out_release_key: + up_read(&mk->mk_secret_sem); + key_put(key); + return err; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + struct key *key; + + if (!ci) + return; + + if (ci->ci_direct_key) { + fscrypt_put_direct_key(ci->ci_direct_key); + } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) && + !fscrypt_is_direct_key_policy(&ci->ci_policy)) { + crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); + } + + key = ci->ci_master_key; + if (key) { + struct fscrypt_master_key *mk = key->payload.data[0]; + + /* + * Remove this inode from the list of inodes that were unlocked + * with the master key. + * + * In addition, if we're removing the last inode from a key that + * already had its secret removed, invalidate the key so that it + * gets removed from ->s_master_keys. 
+ */ + spin_lock(&mk->mk_decrypted_inodes_lock); + list_del(&ci->ci_master_key_link); + spin_unlock(&mk->mk_decrypted_inodes_lock); + if (refcount_dec_and_test(&mk->mk_refcount)) + key_invalidate(key); + key_put(key); + } + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + union fscrypt_context ctx; + struct fscrypt_mode *mode; + struct key *master_key = NULL; + int res; + + if (fscrypt_has_encryption_key(inode)) + return 0; + + res = fscrypt_initialize(inode->i_sb->s_cop->flags); + if (res) + return res; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode) || + IS_ENCRYPTED(inode)) { + fscrypt_warn(inode, + "Error %d getting encryption context", + res); + return res; + } + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); + ctx.version = FSCRYPT_CONTEXT_V1; + ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memset(ctx.v1.master_key_descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + res = sizeof(ctx.v1); + } + + crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_inode = inode; + + res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); + if (res) { + fscrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + goto out; + } + + switch (ctx.version) { + case FSCRYPT_CONTEXT_V1: + memcpy(crypt_info->ci_nonce, ctx.v1.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + memcpy(crypt_info->ci_nonce, ctx.v2.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + default: + WARN_ON(1); + res = -EINVAL; + goto out; + } + + if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { + res = -EINVAL; + goto out; + } + + mode = select_encryption_mode(&crypt_info->ci_policy, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); + goto out; + } + WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); + crypt_info->ci_mode = mode; + + res = setup_file_encryption_key(crypt_info, &master_key); + if (res) + goto out; + + if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (master_key) { + struct fscrypt_master_key *mk = + master_key->payload.data[0]; + + refcount_inc(&mk->mk_refcount); + crypt_info->ci_master_key = key_get(master_key); + spin_lock(&mk->mk_decrypted_inodes_lock); + list_add(&crypt_info->ci_master_key_link, + &mk->mk_decrypted_inodes); + spin_unlock(&mk->mk_decrypted_inodes_lock); + } + crypt_info = NULL; + } + res = 0; +out: + if (master_key) { + struct fscrypt_master_key *mk = master_key->payload.data[0]; + + up_read(&mk->mk_secret_sem); + key_put(master_key); + } + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + return res; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); + +/** + * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * + * Free the inode's fscrypt_info. Filesystems must call this when the inode is + * being evicted. An RCU grace period need not have elapsed yet. + */ +void fscrypt_put_encryption_info(struct inode *inode) +{ + put_crypt_info(inode->i_crypt_info); + inode->i_crypt_info = NULL; +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +/** + * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * + * Free the inode's cached decrypted symlink target, if any. 
Filesystems must + * call this after an RCU grace period, just before they free the inode. + */ +void fscrypt_free_inode(struct inode *inode) +{ + if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { + kfree(inode->i_link); + inode->i_link = NULL; + } +} +EXPORT_SYMBOL(fscrypt_free_inode); + +/** + * fscrypt_drop_inode - check whether the inode's master key has been removed + * + * Filesystems supporting fscrypt must call this from their ->drop_inode() + * method so that encrypted inodes are evicted as soon as they're no longer in + * use and their master key has been removed. + * + * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 + */ +int fscrypt_drop_inode(struct inode *inode) +{ + const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info); + const struct fscrypt_master_key *mk; + + /* + * If ci is NULL, then the inode doesn't have an encryption key set up + * so it's irrelevant. If ci_master_key is NULL, then the master key + * was provided via the legacy mechanism of the process-subscribed + * keyrings, so we don't know whether it's been removed or not. + */ + if (!ci || !ci->ci_master_key) + return 0; + mk = ci->ci_master_key->payload.data[0]; + + /* + * Note: since we aren't holding ->mk_secret_sem, the result here can + * immediately become outdated. But there's no correctness problem with + * unnecessarily evicting. Nor is there a correctness problem with not + * evicting while iput() is racing with the key being removed, since + * then the thread removing the key will either evict the inode itself + * or will correctly detect that it wasn't evicted due to the race. + */ + return !is_master_key_secret_present(&mk->mk_secret); +} +EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c new file mode 100644 index 000000000000..ad1a36c370c3 --- /dev/null +++ b/fs/crypto/keysetup_v1.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup for v1 encryption policies + * + * Copyright 2015, 2019 Google LLC + */ + +/* + * This file implements compatibility functions for the original encryption + * policy version ("v1"), including: + * + * - Deriving per-file keys using the AES-128-ECB based KDF + * (rather than the new method of using HKDF-SHA512) + * + * - Retrieving fscrypt master keys from process-subscribed keyrings + * (rather than the new method of using a filesystem-level keyring) + * + * - Handling policies with the DIRECT_KEY flag set using a master key table + * (rather than the new method of implementing DIRECT_KEY with per-mode keys + * managed alongside the master keys in the filesystem-level keyring) + */ + +#include <crypto/algapi.h> +#include <crypto/skcipher.h> +#include <keys/user-type.h> +#include <linux/hashtable.h> +#include <linux/scatterlist.h> + +#include "fscrypt_private.h" + +/* Table of keys referenced by DIRECT_KEY policies */ +static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */ +static DEFINE_SPINLOCK(fscrypt_direct_keys_lock); + +/* + * v1 key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the nonce as the AES key. This provides a + * unique derived key with sufficient entropy for each inode. However, it's + * nonstandard, non-extensible, doesn't evenly distribute the entropy from the + * master key, and is trivially reversible: an attacker who compromises a + * derived key can "decrypt" it to get back to the master key, then derive any + * other key. For all new code, use HKDF instead. 
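+ *
+ * (The reversibility follows directly from the construction: the nonce used
+ * as the AES-128-ECB key is stored unencrypted in the on-disk encryption
+ * context, so anyone holding a derived key can run the cipher in reverse.)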
+ * + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. + */ +static int derive_key_aes(const u8 *master_key, + const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE], + u8 *derived_key, unsigned int derived_keysize) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, + NULL); + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. + */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct fscrypt_key **payload_ret) +{ + char *description; + struct key *key; + const struct user_key_payload *ukp; + const struct fscrypt_key *payload; + + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); + if (!description) + return ERR_PTR(-ENOMEM); + + key = request_key(&key_type_logon, description, NULL); + kfree(description); + if (IS_ERR(key)) + return key; + + down_read(&key->sem); + ukp = user_key_payload_locked(key); + + if (!ukp) /* was the key revoked before we acquired its semaphore? 
*/ + goto invalid; + + payload = (const struct fscrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct fscrypt_key) || + payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) { + fscrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; + } + + if (payload->size < min_keysize) { + fscrypt_warn(NULL, + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; + } + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Master key referenced by DIRECT_KEY policy */ +struct fscrypt_direct_key { + struct hlist_node dk_node; + refcount_t dk_refcount; + const struct fscrypt_mode *dk_mode; + struct crypto_skcipher *dk_ctfm; + u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; + u8 dk_raw[FSCRYPT_MAX_KEY_SIZE]; +}; + +static void free_direct_key(struct fscrypt_direct_key *dk) +{ + if (dk) { + crypto_free_skcipher(dk->dk_ctfm); + kzfree(dk); + } +} + +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk) +{ + if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock)) + return; + hash_del(&dk->dk_node); + spin_unlock(&fscrypt_direct_keys_lock); + + free_direct_key(dk); +} + +/* + * Find/insert the given key into the fscrypt_direct_keys table. If found, it + * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. If + * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise + * NULL is returned. + */ +static struct fscrypt_direct_key * +find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, + const u8 *raw_key, const struct fscrypt_info *ci) +{ + unsigned long hash_key; + struct fscrypt_direct_key *dk; + + /* + * Careful: to avoid potentially leaking secret key bytes via timing + * information, we must key the hash table by descriptor rather than by + * raw key, and use crypto_memneq() when comparing raw keys. + */ + + BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor, + sizeof(hash_key)); + + spin_lock(&fscrypt_direct_keys_lock); + hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) { + if (memcmp(ci->ci_policy.v1.master_key_descriptor, + dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0) + continue; + if (ci->ci_mode != dk->dk_mode) + continue; + if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) + continue; + /* using existing tfm with same (descriptor, mode, raw_key) */ + refcount_inc(&dk->dk_refcount); + spin_unlock(&fscrypt_direct_keys_lock); + free_direct_key(to_insert); + return dk; + } + if (to_insert) + hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key); + spin_unlock(&fscrypt_direct_keys_lock); + return to_insert; +} + +/* Prepare to encrypt directly using the master key in the given mode */ +static struct fscrypt_direct_key * +fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) +{ + struct fscrypt_direct_key *dk; + int err; + + /* Is there already a tfm for this key? */ + dk = find_or_insert_direct_key(NULL, raw_key, ci); + if (dk) + return dk; + + /* Nope, allocate one. 
*/ + dk = kzalloc(sizeof(*dk), GFP_NOFS); + if (!dk) + return ERR_PTR(-ENOMEM); + refcount_set(&dk->dk_refcount, 1); + dk->dk_mode = ci->ci_mode; + dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, + ci->ci_inode); + if (IS_ERR(dk->dk_ctfm)) { + err = PTR_ERR(dk->dk_ctfm); + dk->dk_ctfm = NULL; + goto err_free_dk; + } + memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); + + return find_or_insert_direct_key(dk, raw_key, ci); + +err_free_dk: + free_direct_key(dk); + return ERR_PTR(err); +} + +/* v1 policy, DIRECT_KEY: use the master key directly */ +static int setup_v1_file_key_direct(struct fscrypt_info *ci, + const u8 *raw_master_key) +{ + const struct fscrypt_mode *mode = ci->ci_mode; + struct fscrypt_direct_key *dk; + + if (!fscrypt_mode_supports_direct_key(mode)) { + fscrypt_warn(ci->ci_inode, + "Direct key mode not allowed with %s", + mode->friendly_name); + return -EINVAL; + } + + if (ci->ci_policy.v1.contents_encryption_mode != + ci->ci_policy.v1.filenames_encryption_mode) { + fscrypt_warn(ci->ci_inode, + "Direct key mode not allowed with different contents and filenames modes"); + return -EINVAL; + } + + /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ + if (WARN_ON(mode->needs_essiv)) + return -EINVAL; + + dk = fscrypt_get_direct_key(ci, raw_master_key); + if (IS_ERR(dk)) + return PTR_ERR(dk); + ci->ci_direct_key = dk; + ci->ci_ctfm = dk->dk_ctfm; + return 0; +} + +/* v1 policy, !DIRECT_KEY: derive the file's encryption key */ +static int setup_v1_file_key_derived(struct fscrypt_info *ci, + const u8 *raw_master_key) +{ + u8 *derived_key; + int err; + + /* + * This cannot be a stack buffer because it will be passed to the + * scatterlist crypto API during derive_key_aes(). + */ + derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + if (!derived_key) + return -ENOMEM; + + err = derive_key_aes(raw_master_key, ci->ci_nonce, + derived_key, ci->ci_mode->keysize); + if (err) + goto out; + + err = fscrypt_set_derived_key(ci, derived_key); +out: + kzfree(derived_key); + return err; +} + +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key) +{ + if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) + return setup_v1_file_key_direct(ci, raw_master_key); + else + return setup_v1_file_key_derived(ci, raw_master_key); +} + +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci) +{ + struct key *key; + const struct fscrypt_key *payload; + int err; + + key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, &payload); + if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) { + key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, &payload); + } + if (IS_ERR(key)) + return PTR_ERR(key); + + err = fscrypt_setup_v1_file_key(ci, payload->raw); + up_read(&key->sem); + key_put(key); + return err; +} diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4941fe8471ce..4072ba644595 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -5,8 +5,9 @@ * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility. * - * Written by Michael Halcrow, 2015. + * Originally written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. + * Modified by Eric Biggers, 2019 for v2 policy support. 
 */
 #include <linux/random.h>
@@ -14,70 +15,303 @@
 #include <linux/mount.h>
 #include "fscrypt_private.h"
-/*
- * check whether an encryption policy is consistent with an encryption context
+/**
+ * fscrypt_policies_equal - check whether two encryption policies are the same
+ *
+ * Return: %true if equal, else %false
+ */
+bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
+			    const union fscrypt_policy *policy2)
+{
+	if (policy1->version != policy2->version)
+		return false;
+
+	return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
+}
+
+/**
+ * fscrypt_supported_policy - check whether an encryption policy is supported
+ *
+ * Given an encryption policy, check whether all its encryption modes and other
+ * settings are supported by this kernel.  (But we don't currently check for
+ * crypto API support here, so attempting to use an algorithm not configured
+ * into the crypto API will still fail later.)
+ *
+ * Return: %true if supported, else %false
+ */
+bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
+			      const struct inode *inode)
+{
+	switch (policy_u->version) {
+	case FSCRYPT_POLICY_V1: {
+		const struct fscrypt_policy_v1 *policy = &policy_u->v1;
+
+		if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+					     policy->filenames_encryption_mode)) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption modes (contents %d, filenames %d)",
+				     policy->contents_encryption_mode,
+				     policy->filenames_encryption_mode);
+			return false;
+		}
+
+		if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption flags (0x%02x)",
+				     policy->flags);
+			return false;
+		}
+
+		return true;
+	}
+	case FSCRYPT_POLICY_V2: {
+		const struct fscrypt_policy_v2 *policy = &policy_u->v2;
+
+		if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+					     policy->filenames_encryption_mode)) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption modes (contents %d, filenames %d)",
+				     policy->contents_encryption_mode,
+				     policy->filenames_encryption_mode);
+			return false;
+		}
+
+		if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption flags (0x%02x)",
+				     policy->flags);
+			return false;
+		}
+
+		if (memchr_inv(policy->__reserved, 0,
+			       sizeof(policy->__reserved))) {
+			fscrypt_warn(inode,
+				     "Reserved bits set in encryption policy");
+			return false;
+		}
+
+		return true;
+	}
+	}
+	return false;
+}
+
+/**
+ * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy
+ *
+ * Create an fscrypt_context for an inode that is being assigned the given
+ * encryption policy.  A new nonce is randomly generated.
+ *
+ * Return: the size of the new context in bytes.
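+ *
+ * (The nonce later feeds per-file key derivation -- or the per-file IVs for
+ * DIRECT_KEY policies -- so that files sharing one master key are still
+ * encrypted differently.)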
*/ -static bool is_encryption_context_consistent_with_policy( - const struct fscrypt_context *ctx, - const struct fscrypt_policy *policy) +static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, + const union fscrypt_policy *policy_u) { - return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx->flags == policy->flags) && - (ctx->contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx->filenames_encryption_mode == - policy->filenames_encryption_mode); + memset(ctx_u, 0, sizeof(*ctx_u)); + + switch (policy_u->version) { + case FSCRYPT_POLICY_V1: { + const struct fscrypt_policy_v1 *policy = &policy_u->v1; + struct fscrypt_context_v1 *ctx = &ctx_u->v1; + + ctx->version = FSCRYPT_CONTEXT_V1; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_descriptor, + policy->master_key_descriptor, + sizeof(ctx->master_key_descriptor)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + case FSCRYPT_POLICY_V2: { + const struct fscrypt_policy_v2 *policy = &policy_u->v2; + struct fscrypt_context_v2 *ctx = &ctx_u->v2; + + ctx->version = FSCRYPT_CONTEXT_V2; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_identifier, + policy->master_key_identifier, + sizeof(ctx->master_key_identifier)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + } + BUG(); } -static int create_encryption_context_from_policy(struct inode *inode, - const struct fscrypt_policy *policy) +/** + * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * + * Given an fscrypt_context, build the corresponding fscrypt_policy. + * + * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized + * version number or size. + * + * This does *not* validate the settings within the policy itself, e.g. the + * modes, flags, and reserved bits. Use fscrypt_supported_policy() for that. 
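+ *
+ * (Callers that need both steps, like fscrypt_get_encryption_info(), first
+ * convert the context and then check the resulting policy's supportedness.)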
+ */ +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size) { - struct fscrypt_context ctx; + memset(policy_u, 0, sizeof(*policy_u)); + + if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u)) + return -EINVAL; + + switch (ctx_u->version) { + case FSCRYPT_CONTEXT_V1: { + const struct fscrypt_context_v1 *ctx = &ctx_u->v1; + struct fscrypt_policy_v1 *policy = &policy_u->v1; + + policy->version = FSCRYPT_POLICY_V1; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->master_key_descriptor, + ctx->master_key_descriptor, + sizeof(policy->master_key_descriptor)); + return 0; + } + case FSCRYPT_CONTEXT_V2: { + const struct fscrypt_context_v2 *ctx = &ctx_u->v2; + struct fscrypt_policy_v2 *policy = &policy_u->v2; + + policy->version = FSCRYPT_POLICY_V2; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->__reserved, ctx->__reserved, + sizeof(policy->__reserved)); + memcpy(policy->master_key_identifier, + ctx->master_key_identifier, + sizeof(policy->master_key_identifier)); + return 0; + } + } + /* unreachable */ + return -EINVAL; +} + +/* Retrieve an inode's encryption policy */ +static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) +{ + const struct fscrypt_info *ci; + union fscrypt_context ctx; + int ret; + + ci = READ_ONCE(inode->i_crypt_info); + if (ci) { + /* key available, use the cached policy */ + *policy = ci->ci_policy; + return 0; + } + + if (!IS_ENCRYPTED(inode)) + return -ENODATA; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret < 0) + return (ret == -ERANGE) ? -EINVAL : ret; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, - policy->filenames_encryption_mode)) + return fscrypt_policy_from_context(policy, &ctx, ret); +} + +static int set_encryption_policy(struct inode *inode, + const union fscrypt_policy *policy) +{ + union fscrypt_context ctx; + int ctxsize; + int err; + + if (!fscrypt_supported_policy(policy, inode)) return -EINVAL; - if (policy->flags & ~FS_POLICY_FLAGS_VALID) + switch (policy->version) { + case FSCRYPT_POLICY_V1: + /* + * The original encryption policy version provided no way of + * verifying that the correct master key was supplied, which was + * insecure in scenarios where multiple users have access to the + * same encrypted files (even just read-only access). The new + * encryption policy version fixes this and also implies use of + * an improved key derivation function and allows non-root users + * to securely remove keys. So as long as compatibility with + * old kernels isn't required, it is recommended to use the new + * policy version for all new encrypted directories. 
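+		 *
+		 * (This is also why only the v2 case below can verify, via
+		 * fscrypt_verify_key_added(), that the specified master key
+		 * has already been added.)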
+ */ + pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n", + current->comm, current->pid); + break; + case FSCRYPT_POLICY_V2: + err = fscrypt_verify_key_added(inode->i_sb, + policy->v2.master_key_identifier); + if (err) + return err; + break; + default: + WARN_ON(1); return -EINVAL; + } - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + ctxsize = fscrypt_new_context_from_policy(&ctx, policy); - return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); + return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { - struct fscrypt_policy policy; + union fscrypt_policy policy; + union fscrypt_policy existing_policy; struct inode *inode = file_inode(filp); + u8 version; + int size; int ret; - struct fscrypt_context ctx; - if (copy_from_user(&policy, arg, sizeof(policy))) + if (get_user(policy.version, (const u8 __user *)arg)) return -EFAULT; + size = fscrypt_policy_size(&policy); + if (size <= 0) + return -EINVAL; + + /* + * We should just copy the remaining 'size - 1' bytes here, but a + * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to + * think that size can be 0 here (despite the check above!) *and* that + * it's a compile-time constant. Thus it would think copy_from_user() + * is passed compile-time constant ULONG_MAX, causing the compile-time + * buffer overflow check to fail, breaking the build. This only occurred + * when building an i386 kernel with -Os and branch profiling enabled. + * + * Work around it by just copying the first byte again... + */ + version = policy.version; + if (copy_from_user(&policy, arg, size)) + return -EFAULT; + policy.version = version; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy.version != 0) - return -EINVAL; - ret = mnt_want_write_file(filp); if (ret) return ret; inode_lock(inode); - ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + ret = fscrypt_get_policy(inode, &existing_policy); if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; @@ -86,14 +320,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else - ret = create_encryption_context_from_policy(inode, - &policy); - } else if (ret == sizeof(ctx) && - is_encryption_context_consistent_with_policy(&ctx, - &policy)) { - /* The file already uses the same encryption policy. */ - ret = 0; - } else if (ret >= 0 || ret == -ERANGE) { + ret = set_encryption_policy(inode, &policy); + } else if (ret == -EINVAL || + (ret == 0 && !fscrypt_policies_equal(&policy, + &existing_policy))) { /* The file already uses a different encryption policy. 
*/ ret = -EEXIST; } @@ -105,37 +335,57 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_set_policy); +/* Original ioctl version; can only get the original policy version */ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { - struct inode *inode = file_inode(filp); - struct fscrypt_context ctx; - struct fscrypt_policy policy; - int res; + union fscrypt_policy policy; + int err; - if (!IS_ENCRYPTED(inode)) - return -ENODATA; + err = fscrypt_get_policy(file_inode(filp), &policy); + if (err) + return err; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0 && res != -ERANGE) - return res; - if (res != sizeof(ctx)) - return -EINVAL; - if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + if (policy.version != FSCRYPT_POLICY_V1) return -EINVAL; - policy.version = 0; - policy.contents_encryption_mode = ctx.contents_encryption_mode; - policy.filenames_encryption_mode = ctx.filenames_encryption_mode; - policy.flags = ctx.flags; - memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - - if (copy_to_user(arg, &policy, sizeof(policy))) + if (copy_to_user(arg, &policy, sizeof(policy.v1))) return -EFAULT; return 0; } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/* Extended ioctl version; can get policies of any version */ +int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) +{ + struct fscrypt_get_policy_ex_arg arg; + union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy; + size_t policy_size; + int err; + + /* arg is policy_size, then policy */ + BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0); + BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) != + offsetof(typeof(arg), policy)); + BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy)); + + err = fscrypt_get_policy(file_inode(filp), policy); + if (err) + return err; + policy_size = fscrypt_policy_size(policy); + + if (copy_from_user(&arg, uarg, sizeof(arg.policy_size))) + return -EFAULT; + + if (policy_size > arg.policy_size) + return -EOVERFLOW; + arg.policy_size = policy_size; + + if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex); + /** * fscrypt_has_permitted_context() - is a file's encryption policy permitted * within its directory? @@ -157,10 +407,8 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy); */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - const struct fscrypt_operations *cops = parent->i_sb->s_cop; - const struct fscrypt_info *parent_ci, *child_ci; - struct fscrypt_context parent_ctx, child_ctx; - int res; + union fscrypt_policy parent_policy, child_policy; + int err; /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && @@ -190,41 +438,22 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) * In any case, if an unexpected error occurs, fall back to "forbidden". 
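 * (Returning 0 therefore fails safe: a child whose policy cannot be proven
 * to match its parent's is treated as off-limits.)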
*/ - res = fscrypt_get_encryption_info(parent); - if (res) + err = fscrypt_get_encryption_info(parent); + if (err) return 0; - res = fscrypt_get_encryption_info(child); - if (res) + err = fscrypt_get_encryption_info(child); + if (err) return 0; - parent_ci = READ_ONCE(parent->i_crypt_info); - child_ci = READ_ONCE(child->i_crypt_info); - - if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key_descriptor, - child_ci->ci_master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == - child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags); - } - res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); - if (res != sizeof(parent_ctx)) + err = fscrypt_get_policy(parent, &parent_policy); + if (err) return 0; - res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); - if (res != sizeof(child_ctx)) + err = fscrypt_get_policy(child, &child_policy); + if (err) return 0; - return memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode) && - (parent_ctx.flags == child_ctx.flags); + return fscrypt_policies_equal(&parent_policy, &child_policy); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -240,7 +469,8 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) { - struct fscrypt_context ctx; + union fscrypt_context ctx; + int ctxsize; struct fscrypt_info *ci; int res; @@ -252,16 +482,10 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, if (ci == NULL) return -ENOKEY; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy); + BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - res = parent->i_sb->s_cop->set_context(child, &ctx, - sizeof(ctx), fs_data); + res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data); if (res) return res; return preload ? fscrypt_get_encryption_info(child): 0; diff --git a/fs/d_path.c b/fs/d_path.c index a7d0a96b35ce..0f1fc1743302 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -116,8 +116,10 @@ restart: vfsmnt = &mnt->mnt; continue; } - if (!error) - error = is_mounted(vfsmnt) ? 
1 : 2;
+		if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns))
+			error = 1;	// absolute root
+		else
+			error = 2;	// detached or not attached yet
 			break;
 		}
 		parent = dentry->d_parent;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index beeadca23b05..42e5a766d33c 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -622,7 +622,7 @@ void devpts_pty_kill(struct dentry *dentry)
 	dentry->d_fsdata = NULL;
 	drop_nlink(dentry->d_inode);
 	fsnotify_unlink(d_inode(dentry->d_parent), dentry);
-	d_delete(dentry);
+	d_drop(dentry);
 	dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
 }
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 000000000000..9d634d3a1845
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config EROFS_FS
+	tristate "EROFS filesystem support"
+	depends on BLOCK
+	help
+	  EROFS (Enhanced Read-Only File System) is a lightweight read-only
+	  file system with modern designs (e.g. page-sized blocks, inline
+	  xattrs/data, etc.) for scenarios that demand high-performance
+	  read-only access, e.g. Android OS for mobile phones and live CDs.
+
+	  It also supports fixed-sized output compression, which improves
+	  storage density and keeps compression ratios relatively high,
+	  making it especially useful for embedded devices with limited
+	  memory.
+
+	  If unsure, say N.
+
+config EROFS_FS_DEBUG
+	bool "EROFS debugging feature"
+	depends on EROFS_FS
+	help
+	  Print debugging messages and enable more BUG_ONs which check
+	  filesystem consistency and find potential issues aggressively,
+	  e.g. for Android engineering builds.
+
+	  For daily use, say N.
+
+config EROFS_FS_XATTR
+	bool "EROFS extended attributes"
+	depends on EROFS_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config EROFS_FS_POSIX_ACL
+	bool "EROFS Access Control Lists"
+	depends on EROFS_FS_XATTR
+	select FS_POSIX_ACL
+	default y
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
+config EROFS_FS_SECURITY
+	bool "EROFS Security Labels"
+	depends on EROFS_FS_XATTR
+	default y
+	help
+	  Security labels provide an access control facility to support Linux
+	  Security Models (LSMs) accepted by AppArmor, SELinux, Smack and
+	  TOMOYO Linux.  This option enables an extended attribute handler
+	  for file security labels in the erofs filesystem; it therefore
+	  requires extended attribute support to be enabled.
+
+	  If you are not using a security module, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	select LZ4_DECOMPRESS
+	default y
+	help
+	  Enable fixed-sized output compression for EROFS.
+
+	  If you don't want compression support, say N.
+
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the maximum number of pages in a compressed physical
+	  cluster.
+
+	  For example, if files in an image were compressed in 8KiB units,
+	  this hard limit must be at least 2; otherwise the image will be
+	  refused at mount time on this kernel.
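+
+	  If unsure, leave the default value (1).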
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 000000000000..46f2aa4ba46c
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+EROFS_VERSION = "1.0"
+
+ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+
+obj-$(CONFIG_EROFS_FS) += erofs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 000000000000..07d279fd5d67
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_COMPRESS_H
+#define __EROFS_FS_COMPRESS_H
+
+#include "internal.h"
+
+enum {
+	Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+	Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
+
+struct z_erofs_decompress_req {
+	struct super_block *sb;
+	struct page **in, **out;
+
+	unsigned short pageofs_out;
+	unsigned int inputsize, outputsize;
+
+	/* indicates the algorithm to be used for decompression */
+	unsigned int alg;
+	bool inplace_io, partial_decoding;
+};
+
+/*
+ * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
+ * used to mark temporarily allocated pages, distinguishing them from
+ * file/cached pages and pages with a NULL mapping.
+ */
+#define Z_EROFS_MAPPING_STAGING	((void *)0x5A110C8D)
+
+/* check if a page is marked as staging */
+static inline bool z_erofs_page_is_staging(struct page *page)
+{
+	return page->mapping == Z_EROFS_MAPPING_STAGING;
+}
+
+static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
+					   struct page *page)
+{
+	if (!z_erofs_page_is_staging(page))
+		return false;
+
+	/* staging pages should not be used by others at the same time */
+	if (page_ref_count(page) > 1)
+		put_page(page);
+	else
+		list_add(&page->lru, pagepool);
+	return true;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+		       struct list_head *pagepool);
+
+#endif
+
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 000000000000..8a9fcbd0e8ac
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +static void erofs_readendio(struct bio *bio) +{ + struct bio_vec *bvec; + blk_status_t err = bio->bi_status; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + /* page is already locked */ + DBG_BUGON(PageUptodate(page)); + + if (err) + SetPageError(page); + else + SetPageUptodate(page); + + unlock_page(page); + /* page could be reclaimed now */ + } + bio_put(bio); +} + +struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) +{ + struct inode *const bd_inode = sb->s_bdev->bd_inode; + struct address_space *const mapping = bd_inode->i_mapping; + + return read_cache_page_gfp(mapping, blkaddr, + mapping_gfp_constraint(mapping, ~__GFP_FS)); +} + +static int erofs_map_blocks_flatmode(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + int err = 0; + erofs_blk_t nblocks, lastblk; + u64 offset = map->m_la; + struct erofs_inode *vi = EROFS_I(inode); + bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + + trace_erofs_map_blocks_flatmode_enter(inode, map, flags); + + nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + lastblk = nblocks - tailendpacking; + + if (offset >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + /* there is no hole in flatmode */ + map->m_flags = EROFS_MAP_MAPPED; + + if (offset < blknr_to_addr(lastblk)) { + map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; + map->m_plen = blknr_to_addr(lastblk) - offset; + } else if (tailendpacking) { + /* 2 - inode inline B: inode, [xattrs], inline last blk... 
*/ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_plen = inode->i_size - offset; + + /* inline data should be located in one meta block */ + if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto err_out; + } + + map->m_flags |= EROFS_MAP_META; + } else { + erofs_err(inode->i_sb, + "internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); + DBG_BUGON(1); + err = -EIO; + goto err_out; + } + +out: + map->m_llen = map->m_plen; + +err_out: + trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); + return err; +} + +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { + int err = z_erofs_map_blocks_iter(inode, map, flags); + + if (map->mpage) { + put_page(map->mpage); + map->mpage = NULL; + } + return err; + } + return erofs_map_blocks_flatmode(inode, map, flags); +} + +static inline struct bio *erofs_read_raw_page(struct bio *bio, + struct address_space *mapping, + struct page *page, + erofs_off_t *last_block, + unsigned int nblocks, + bool ra) +{ + struct inode *const inode = mapping->host; + struct super_block *const sb = inode->i_sb; + erofs_off_t current_block = (erofs_off_t)page->index; + int err; + + DBG_BUGON(!nblocks); + + if (PageUptodate(page)) { + err = 0; + goto has_updated; + } + + /* note that for readpage case, bio also equals to NULL */ + if (bio && + /* not continuous */ + *last_block + 1 != current_block) { +submit_bio_retry: + submit_bio(bio); + bio = NULL; + } + + if (!bio) { + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(current_block), + }; + erofs_blk_t blknr; + unsigned int blkoff; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (err) + goto err_out; + + /* zero out the holed page */ + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* for RAW access mode, m_plen must be equal to m_llen */ + DBG_BUGON(map.m_plen != map.m_llen); + + blknr = erofs_blknr(map.m_pa); + blkoff = erofs_blkoff(map.m_pa); + + /* deal with inline page */ + if (map.m_flags & EROFS_MAP_META) { + void *vsrc, *vto; + struct page *ipage; + + DBG_BUGON(map.m_plen > PAGE_SIZE); + + ipage = erofs_get_meta_page(inode->i_sb, blknr); + + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto err_out; + } + + vsrc = kmap_atomic(ipage); + vto = kmap_atomic(page); + memcpy(vto, vsrc + blkoff, map.m_plen); + memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); + kunmap_atomic(vto); + kunmap_atomic(vsrc); + flush_dcache_page(page); + + SetPageUptodate(page); + /* TODO: could we unlock the page earlier? 
 */
+			unlock_page(ipage);
+			put_page(ipage);
+
+			/* imply err = 0, see erofs_map_blocks */
+			goto has_updated;
+		}
+
+		/* pa must be block-aligned for raw reading */
+		DBG_BUGON(erofs_blkoff(map.m_pa));
+
+		/* max # of continuous pages */
+		if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
+			nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
+		if (nblocks > BIO_MAX_PAGES)
+			nblocks = BIO_MAX_PAGES;
+
+		bio = bio_alloc(GFP_NOIO, nblocks);
+
+		bio->bi_end_io = erofs_readendio;
+		bio_set_dev(bio, sb->s_bdev);
+		bio->bi_iter.bi_sector = (sector_t)blknr <<
+			LOG_SECTORS_PER_BLOCK;
+		bio->bi_opf = REQ_OP_READ;
+	}
+
+	err = bio_add_page(bio, page, PAGE_SIZE, 0);
+	/* out of the extent or bio is full */
+	if (err < PAGE_SIZE)
+		goto submit_bio_retry;
+
+	*last_block = current_block;
+
+	/* submit the bio in advance in case it is followed by too many gaps */
+	if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
+		/* err should be reset to 0 after submitting */
+		err = 0;
+		goto submit_bio_out;
+	}
+
+	return bio;
+
+err_out:
+	/* for sync reading, set the page error immediately */
+	if (!ra) {
+		SetPageError(page);
+		ClearPageUptodate(page);
+	}
+has_updated:
+	unlock_page(page);
+
+	/* if updated manually, continuous pages have a gap */
+	if (bio)
+submit_bio_out:
+		submit_bio(bio);
+	return err ? ERR_PTR(err) : NULL;
+}
+
+/*
+ * Since we don't have write or truncate flows, no inode
+ * locking needs to be held at the moment.
+ */
+static int erofs_raw_access_readpage(struct file *file, struct page *page)
+{
+	erofs_off_t last_block;
+	struct bio *bio;
+
+	trace_erofs_readpage(page, true);
+
+	bio = erofs_read_raw_page(NULL, page->mapping,
+				  page, &last_block, 1, false);
+
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	DBG_BUGON(bio);	/* since we have only one bio -- must be NULL */
+	return 0;
+}
+
+static int erofs_raw_access_readpages(struct file *filp,
+				      struct address_space *mapping,
+				      struct list_head *pages,
+				      unsigned int nr_pages)
+{
+	erofs_off_t last_block;
+	struct bio *bio = NULL;
+	gfp_t gfp = readahead_gfp_mask(mapping);
+	struct page *page = list_last_entry(pages, struct page, lru);
+
+	trace_erofs_readpages(mapping->host, page, nr_pages, true);
+
+	for (; nr_pages; --nr_pages) {
+		page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			bio = erofs_read_raw_page(bio, mapping, page,
+						  &last_block, nr_pages, true);
+
+			/* all page errors are ignored during readahead */
+			if (IS_ERR(bio)) {
+				pr_err("%s, readahead error at page %lu of nid %llu\n",
+				       __func__, page->index,
+				       EROFS_I(mapping->host)->nid);
+
+				bio = NULL;
+			}
+		}
+
+		/* pages could still be locked */
+		put_page(page);
+	}
+	DBG_BUGON(!list_empty(pages));
+
+	/* the rare case (the read ends in gaps) */
+	if (bio)
+		submit_bio(bio);
+	return 0;
+}
+
+static int erofs_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create)
+{
+	struct erofs_map_blocks map = {
+		.m_la = iblock << 9,
+	};
+	int err;
+
+	err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+	if (err)
+		return err;
+
+	if (map.m_flags & EROFS_MAP_MAPPED)
+		bh->b_blocknr = erofs_blknr(map.m_pa);
+
+	return err;
+}
+
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+
+	if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
+		erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
+
+		if (block >> LOG_SECTORS_PER_BLOCK >= blks)
+			return 0;
+	}
+
+	return generic_block_bmap(mapping, block, erofs_get_block);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations erofs_raw_access_aops = {
+	.readpage = erofs_raw_access_readpage,
+	.readpages = erofs_raw_access_readpages,
+	.bmap = erofs_bmap,
+};
+
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 000000000000..19f89f9fb10c
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "compress.h"
+#include <linux/module.h>
+#include <linux/lz4.h>
+
+#ifndef LZ4_DISTANCE_MAX	/* history window size */
+#define LZ4_DISTANCE_MAX	65535	/* set to maximum value by default */
+#endif
+
+#define LZ4_MAX_DISTANCE_PAGES	(DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
+#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize)	(((srcsize) >> 8) + 32)
+#endif
+
+struct z_erofs_decompressor {
+	/*
+	 * If destpages contains sparse (absent) pages, fill them with bounce
+	 * pages.  This also checks whether destpages indicate continuous
+	 * physical memory.
+	 */
+	int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
+				 struct list_head *pagepool);
+	int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
+	char *name;
+};
+
+static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
+					 struct list_head *pagepool)
+{
+	const unsigned int nr =
+		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+	struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
+	unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
+					   BITS_PER_LONG)] = { 0 };
+	void *kaddr = NULL;
+	unsigned int i, j, top;
+
+	top = 0;
+	for (i = j = 0; i < nr; ++i, ++j) {
+		struct page *const page = rq->out[i];
+		struct page *victim;
+
+		if (j >= LZ4_MAX_DISTANCE_PAGES)
+			j = 0;
+
+		/* 'valid' bounced can only be tested after a complete round */
+		if (test_bit(j, bounced)) {
+			DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
+			DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
+			availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+		}
+
+		if (page) {
+			__clear_bit(j, bounced);
+			if (kaddr) {
+				if (kaddr + PAGE_SIZE == page_address(page))
+					kaddr += PAGE_SIZE;
+				else
+					kaddr = NULL;
+			} else if (!i) {
+				kaddr = page_address(page);
+			}
+			continue;
+		}
+		kaddr = NULL;
+		__set_bit(j, bounced);
+
+		if (top) {
+			victim = availables[--top];
+			get_page(victim);
+		} else {
+			victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
+			if (!victim)
+				return -ENOMEM;
+			victim->mapping = Z_EROFS_MAPPING_STAGING;
+		}
+		rq->out[i] = victim;
+	}
+	return kaddr ? 1 : 0;
+}
+
+static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
+				       u8 *src, unsigned int pageofs_in)
+{
+	/*
+	 * When in-place decompression is in use, the output pages overlap the
+	 * input pages, so the compressed data must be copied out first to
+	 * avoid being overwritten while it is still needed.
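+	 * (The copy lands in the per-CPU buffer obtained from
+	 * erofs_get_pcpubuf() just below.)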
+ */ + struct page **in = rq->in; + u8 *const tmp = erofs_get_pcpubuf(0); + u8 *tmpp = tmp; + unsigned int inlen = rq->inputsize - pageofs_in; + unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); + + while (tmpp < tmp + inlen) { + if (!src) + src = kmap_atomic(*in); + memcpy(tmpp, src + pageofs_in, count); + kunmap_atomic(src); + src = NULL; + tmpp += count; + pageofs_in = 0; + count = PAGE_SIZE; + ++in; + } + return tmp; +} + +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +{ + unsigned int inputmargin, inlen; + u8 *src; + bool copied, support_0padding; + int ret; + + if (rq->inputsize > PAGE_SIZE) + return -EOPNOTSUPP; + + src = kmap_atomic(*rq->in); + inputmargin = 0; + support_0padding = false; + + /* decompression inplace is only safe when 0padding is enabled */ + if (EROFS_SB(rq->sb)->feature_incompat & + EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + support_0padding = true; + + while (!src[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= rq->inputsize) { + kunmap_atomic(src); + return -EIO; + } + } + + copied = false; + inlen = rq->inputsize - inputmargin; + if (rq->inplace_io) { + const uint oend = (rq->pageofs_out + + rq->outputsize) & ~PAGE_MASK; + const uint nr = PAGE_ALIGN(rq->pageofs_out + + rq->outputsize) >> PAGE_SHIFT; + + if (rq->partial_decoding || !support_0padding || + rq->out[nr - 1] != rq->in[0] || + rq->inputsize - oend < + LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { + src = generic_copy_inplace_data(rq, src, inputmargin); + inputmargin = 0; + copied = true; + } + } + + ret = LZ4_decompress_safe_partial(src + inputmargin, out, + inlen, rq->outputsize, + rq->outputsize); + if (ret < 0) { + erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]", + inlen, inputmargin, rq->outputsize); + WARN_ON(1); + print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, + 16, 1, src + inputmargin, inlen, true); + print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, rq->outputsize, true); + ret = -EIO; + } + + if (copied) + erofs_put_pcpubuf(src); + else + kunmap_atomic(src); + return ret; +} + +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .prepare_destpages = z_erofs_lz4_prepare_destpages, + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +}; + +static void copy_from_pcpubuf(struct page **out, const char *dst, + unsigned short pageofs_out, + unsigned int outputsize) +{ + const char *end = dst + outputsize; + const unsigned int righthalf = PAGE_SIZE - pageofs_out; + const char *cur = dst - pageofs_out; + + while (cur < end) { + struct page *const page = *out++; + + if (page) { + char *buf = kmap_atomic(page); + + if (cur >= dst) { + memcpy(buf, cur, min_t(uint, PAGE_SIZE, + end - cur)); + } else { + memcpy(buf + pageofs_out, cur + pageofs_out, + min_t(uint, righthalf, end - cur)); + } + kunmap_atomic(buf); + } + cur += PAGE_SIZE; + } +} + +static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const struct z_erofs_decompressor *alg = decompressors + rq->alg; + unsigned int dst_maptype; + void *dst; + int ret, i; + + if (nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } + + /* + * For the case of small output size (especially much 
less
+	 * than PAGE_SIZE), copying the decompressed data out of a temporary
+	 * buffer is preferred over copying the compressed data around for
+	 * in-place I/O.
+	 */
+	if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+		dst = erofs_get_pcpubuf(0);
+		if (IS_ERR(dst))
+			return PTR_ERR(dst);
+
+		rq->inplace_io = false;
+		ret = alg->decompress(rq, dst);
+		if (!ret)
+			copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+					  rq->outputsize);
+
+		erofs_put_pcpubuf(dst);
+		return ret;
+	}
+
+	ret = alg->prepare_destpages(rq, pagepool);
+	if (ret < 0) {
+		return ret;
+	} else if (ret) {
+		dst = page_address(*rq->out);
+		dst_maptype = 1;
+		goto dstmap_out;
+	}
+
+	i = 0;
+	while (1) {
+		dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
+
+		/* retry two more times (three times in total) */
+		if (dst || ++i >= 3)
+			break;
+		vm_unmap_aliases();
+	}
+
+	if (!dst)
+		return -ENOMEM;
+
+	dst_maptype = 2;
+
+dstmap_out:
+	ret = alg->decompress(rq, dst + rq->pageofs_out);
+
+	if (!dst_maptype)
+		kunmap_atomic(dst);
+	else if (dst_maptype == 2)
+		vm_unmap_ram(dst, nrpages_out);
+	return ret;
+}
+
+static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
+				     struct list_head *pagepool)
+{
+	const unsigned int nrpages_out =
+		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+	const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+	unsigned char *src, *dst;
+
+	if (nrpages_out > 2) {
+		DBG_BUGON(1);
+		return -EIO;
+	}
+
+	if (rq->out[0] == *rq->in) {
+		DBG_BUGON(nrpages_out != 1);
+		return 0;
+	}
+
+	src = kmap_atomic(*rq->in);
+	if (!rq->out[0]) {
+		dst = NULL;
+	} else {
+		dst = kmap_atomic(rq->out[0]);
+		memcpy(dst + rq->pageofs_out, src, righthalf);
+	}
+
+	if (rq->out[1] == *rq->in) {
+		memmove(src, src + righthalf, rq->pageofs_out);
+	} else if (nrpages_out == 2) {
+		if (dst)
+			kunmap_atomic(dst);
+		DBG_BUGON(!rq->out[1]);
+		dst = kmap_atomic(rq->out[1]);
+		memcpy(dst, src + righthalf, rq->pageofs_out);
+	}
+	if (dst)
+		kunmap_atomic(dst);
+	kunmap_atomic(src);
+	return 0;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+		       struct list_head *pagepool)
+{
+	if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
+		return z_erofs_shifted_transform(rq, pagepool);
+	return z_erofs_decompress_generic(rq, pagepool);
+}
+
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
new file mode 100644
index 000000000000..d28c623dfef9
--- /dev/null
+++ b/fs/erofs/dir.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+
+static void debug_one_dentry(unsigned char d_type, const char *de_name,
+			     unsigned int de_namelen)
+{
+#ifdef CONFIG_EROFS_FS_DEBUG
+	/* the on-disk name may lack a trailing '\0', so null-terminate a copy */
+	unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
+
+	memcpy(dbg_namebuf, de_name, de_namelen);
+	dbg_namebuf[de_namelen] = '\0';
+
+	erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf,
+		  de_namelen, d_type);
+#endif
+}
+
+static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
+			       void *dentry_blk, unsigned int *ofs,
+			       unsigned int nameoff, unsigned int maxsize)
+{
+	struct erofs_dirent *de = dentry_blk + *ofs;
+	const struct erofs_dirent *end = dentry_blk + nameoff;
+
+	while (de < end) {
+		const char *de_name;
+		unsigned int de_namelen;
+		unsigned char d_type;
+
+		d_type = fs_ftype_to_dtype(de->file_type);
+
+		nameoff = le16_to_cpu(de->nameoff);
+		de_name = (char *)dentry_blk + nameoff;
+
+		/* the last dirent in the block?
*/ + if (de + 1 >= end) + de_namelen = strnlen(de_name, maxsize - nameoff); + else + de_namelen = le16_to_cpu(de[1].nameoff) - nameoff; + + /* a corrupted entry is found */ + if (nameoff + de_namelen > maxsize || + de_namelen > EROFS_NAME_LEN) { + erofs_err(dir->i_sb, "bogus dirent @ nid %llu", + EROFS_I(dir)->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + debug_one_dentry(d_type, de_name, de_namelen); + if (!dir_emit(ctx, de_name, de_namelen, + le64_to_cpu(de->nid), d_type)) + /* stopped by some reason */ + return 1; + ++de; + *ofs += sizeof(struct erofs_dirent); + } + *ofs = maxsize; + return 0; +} + +static int erofs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct address_space *mapping = dir->i_mapping; + const size_t dirsize = i_size_read(dir); + unsigned int i = ctx->pos / EROFS_BLKSIZ; + unsigned int ofs = ctx->pos % EROFS_BLKSIZ; + int err = 0; + bool initial = true; + + while (ctx->pos < dirsize) { + struct page *dentry_page; + struct erofs_dirent *de; + unsigned int nameoff, maxsize; + + dentry_page = read_mapping_page(mapping, i, NULL); + if (dentry_page == ERR_PTR(-ENOMEM)) { + err = -ENOMEM; + break; + } else if (IS_ERR(dentry_page)) { + erofs_err(dir->i_sb, + "fail to readdir of logical block %u of nid %llu", + i, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; + break; + } + + de = (struct erofs_dirent *)kmap(dentry_page); + + nameoff = le16_to_cpu(de->nameoff); + + if (nameoff < sizeof(struct erofs_dirent) || + nameoff >= PAGE_SIZE) { + erofs_err(dir->i_sb, + "invalid de[0].nameoff %u @ nid %llu", + nameoff, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; + goto skip_this; + } + + maxsize = min_t(unsigned int, + dirsize - ctx->pos + ofs, PAGE_SIZE); + + /* search dirents at the arbitrary position */ + if (initial) { + initial = false; + + ofs = roundup(ofs, sizeof(struct erofs_dirent)); + if (ofs >= nameoff) + goto skip_this; + } + + err = erofs_fill_dentries(dir, ctx, de, &ofs, + nameoff, maxsize); +skip_this: + kunmap(dentry_page); + + put_page(dentry_page); + + ctx->pos = blknr_to_addr(i) + ofs; + + if (err) + break; + ++i; + ofs = 0; + } + return err < 0 ? err : 0; +} + +const struct file_operations erofs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = erofs_readdir, +}; + diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h new file mode 100644 index 000000000000..b1ee5654750d --- /dev/null +++ b/fs/erofs/erofs_fs.h @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ +/* + * EROFS (Enhanced ROM File System) on-disk format definition + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_H +#define __EROFS_FS_H + +#define EROFS_SUPER_OFFSET 1024 + +/* + * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should + * be incompatible with this kernel version. 
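+ *
+ * For illustration, the mount-time check in check_layout_compatibility()
+ * (super.c) boils down to (editor's sketch, not part of this patch):
+ *
+ *	feature = le32_to_cpu(dsb->feature_incompat);
+ *	if (feature & ~EROFS_ALL_FEATURE_INCOMPAT)
+ *		return false;	(refuse to mount)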
+ */ +#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 +#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING + +/* 128-byte erofs on-disk super block */ +struct erofs_super_block { + __le32 magic; /* file system magic number */ + __le32 checksum; /* crc32c(super_block) */ + __le32 feature_compat; + __u8 blkszbits; /* support block_size == PAGE_SIZE only */ + __u8 reserved; + + __le16 root_nid; /* nid of root directory */ + __le64 inos; /* total valid ino # (== f_files - f_favail) */ + + __le64 build_time; /* inode v1 time derivation */ + __le32 build_time_nsec; /* inode v1 time derivation in nano scale */ + __le32 blocks; /* used for statfs */ + __le32 meta_blkaddr; /* start block address of metadata area */ + __le32 xattr_blkaddr; /* start block address of shared xattr area */ + __u8 uuid[16]; /* 128-bit uuid for volume */ + __u8 volume_name[16]; /* volume name */ + __le32 feature_incompat; + + __u8 reserved2[44]; +}; + +/* + * erofs inode datalayout (i_format in on-disk inode): + * 0 - inode plain without inline data A: + * inode, [xattrs], ... | ... | no-holed data + * 1 - inode VLE compression B (legacy): + * inode, [xattrs], extents ... | ... + * 2 - inode plain with inline data C: + * inode, [xattrs], last_inline_data, ... | ... | no-holed data + * 3 - inode compression D: + * inode, [xattrs], map_header, extents ... | ... + * 4~7 - reserved + */ +enum { + EROFS_INODE_FLAT_PLAIN = 0, + EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, + EROFS_INODE_FLAT_INLINE = 2, + EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_DATALAYOUT_MAX +}; + +static inline bool erofs_inode_is_data_compressed(unsigned int datamode) +{ + return datamode == EROFS_INODE_FLAT_COMPRESSION || + datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; +} + +/* bit definitions of inode i_advise */ +#define EROFS_I_VERSION_BITS 1 +#define EROFS_I_DATALAYOUT_BITS 3 + +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 + +/* 32-byte reduced form of an ondisk inode */ +struct erofs_inode_compact { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_nlink; + __le32 i_size; + __le32 i_reserved; + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u; + __le32 i_ino; /* only used for 32-bit stat compatibility */ + __le16 i_uid; + __le16 i_gid; + __le32 i_reserved2; +}; + +/* 32 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 + +/* 64-byte complete form of an ondisk inode */ +struct erofs_inode_extended { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_reserved; + __le64 i_size; + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u; + + /* only used for 32-bit stat compatibility */ + __le32 i_ino; + + __le32 i_uid; + __le32 i_gid; + __le64 i_ctime; + __le32 i_ctime_nsec; + __le32 i_nlink; + __u8 i_reserved2[16]; +}; + +#define EROFS_MAX_SHARED_XATTRS (128) +/* h_shared_count between 129 ... 
255 are special # */ +#define EROFS_SHARED_XATTR_EXTENT (255) + +/* + * inline xattrs (n == i_xattr_icount): + * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes + * 12 bytes / \ + * / \ + * /-----------------------\ + * | erofs_xattr_entries+ | + * +-----------------------+ + * inline xattrs must starts in erofs_xattr_ibody_header, + * for read-only fs, no need to introduce h_refcount + */ +struct erofs_xattr_ibody_header { + __le32 h_reserved; + __u8 h_shared_count; + __u8 h_reserved2[7]; + __le32 h_shared_xattrs[0]; /* shared xattr id array */ +}; + +/* Name indexes */ +#define EROFS_XATTR_INDEX_USER 1 +#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EROFS_XATTR_INDEX_TRUSTED 4 +#define EROFS_XATTR_INDEX_LUSTRE 5 +#define EROFS_XATTR_INDEX_SECURITY 6 + +/* xattr entry (for both inline & shared xattrs) */ +struct erofs_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_size; /* size of attribute value */ + /* followed by e_name and e_value */ + char e_name[0]; /* attribute name */ +}; + +static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) +{ + if (!i_xattr_icount) + return 0; + + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); +} + +#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) + +static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) +{ + return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) + + e->e_name_len + le16_to_cpu(e->e_value_size)); +} + +/* available compression algorithm types (for h_algorithmtype) */ +enum { + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_MAX +}; + +/* + * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) + * e.g. for 4k logical cluster size, 4B if compacted 2B is off; + * (4B) + 2B + (4B) if compacted 2B is on. + */ +#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 + +#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) + +struct z_erofs_map_header { + __le32 h_reserved1; + __le16 h_advise; + /* + * bit 0-3 : algorithm type of head 1 (logical cluster type 01); + * bit 4-7 : algorithm type of head 2 (logical cluster type 11). + */ + __u8 h_algorithmtype; + /* + * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; + * bit 3-4 : (physical - logical) cluster bits of head 1: + * For example, if logical clustersize = 4096, 1 for 8192. + * bit 5-7 : (physical - logical) cluster bits of head 2. 
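+ *
+ * A decoder would therefore compute (editor's sketch, not part of
+ * this patch; variable names are illustrative):
+ *
+ *	lclusterbits  = 12 + (h_clusterbits & 7);
+ *	pclusterbits1 = lclusterbits + ((h_clusterbits >> 3) & 3);
+ *	pclusterbits2 = lclusterbits + ((h_clusterbits >> 5) & 7);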
+ */ + __u8 h_clusterbits; +}; + +#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 + +/* + * Fixed-sized output compression ondisk Logical Extent cluster type: + * 0 - literal (uncompressed) cluster + * 1 - compressed cluster (for the head logical cluster) + * 2 - compressed cluster (for the other logical clusters) + * + * In detail, + * 0 - literal (uncompressed) cluster, + * di_advise = 0 + * di_clusterofs = the literal data offset of the cluster + * di_blkaddr = the blkaddr of the literal cluster + * + * 1 - compressed cluster (for the head logical cluster) + * di_advise = 1 + * di_clusterofs = the decompressed data offset of the cluster + * di_blkaddr = the blkaddr of the compressed cluster + * + * 2 - compressed cluster (for the other logical clusters) + * di_advise = 2 + * di_clusterofs = + * the decompressed data offset in its own head cluster + * di_u.delta[0] = distance to its corresponding head cluster + * di_u.delta[1] = distance to its corresponding tail cluster + * (di_advise could be 0, 1 or 2) + */ +enum { + Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_MAX +}; + +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 + +struct z_erofs_vle_decompressed_index { + __le16 di_advise; + /* where to decompress in the head cluster */ + __le16 di_clusterofs; + + union { + /* for the head cluster */ + __le32 blkaddr; + /* + * for the rest clusters + * eg. for 4k page-sized cluster, maximum 4K*64k = 256M) + * [0] - pointing to the head cluster + * [1] - pointing to the tail cluster + */ + __le16 delta[2]; + } di_u; +}; + +#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ + (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ + sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + +/* check the EROFS on-disk layout strictly at compile time */ +static inline void erofs_check_ondisk_layout_definitions(void) +{ + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); + BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); + BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + + BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < + Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); +} + +#endif + diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c new file mode 100644 index 000000000000..3350ab65d892 --- /dev/null +++ b/fs/erofs/inode.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +/* no locking */ +static int erofs_read_inode(struct inode *inode, void *data) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_compact *dic = data; + struct erofs_inode_extended *die; + + const unsigned int ifmt = le16_to_cpu(dic->i_format); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + erofs_blk_t nblks = 0; + + vi->datalayout = erofs_inode_datalayout(ifmt); + + if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { + erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + vi->datalayout, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + + switch (erofs_inode_version(ifmt)) { + case EROFS_INODE_LAYOUT_EXTENDED: + die = data; + + vi->inode_isize = sizeof(struct erofs_inode_extended); + vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); + + inode->i_mode = le16_to_cpu(die->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = + new_decode_dev(le32_to_cpu(die->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + goto bogusimode; + } + i_uid_write(inode, le32_to_cpu(die->i_uid)); + i_gid_write(inode, le32_to_cpu(die->i_gid)); + set_nlink(inode, le32_to_cpu(die->i_nlink)); + + /* ns timestamp */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + le64_to_cpu(die->i_ctime); + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + le32_to_cpu(die->i_ctime_nsec); + + inode->i_size = le64_to_cpu(die->i_size); + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(die->i_u.compressed_blocks); + break; + case EROFS_INODE_LAYOUT_COMPACT: + vi->inode_isize = sizeof(struct erofs_inode_compact); + vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); + + inode->i_mode = le16_to_cpu(dic->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = + new_decode_dev(le32_to_cpu(dic->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + goto bogusimode; + } + i_uid_write(inode, le16_to_cpu(dic->i_uid)); + i_gid_write(inode, le16_to_cpu(dic->i_gid)); + set_nlink(inode, le16_to_cpu(dic->i_nlink)); + + /* use build time to derive all file time */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + sbi->build_time; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + sbi->build_time_nsec; + + inode->i_size = le32_to_cpu(dic->i_size); + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(dic->i_u.compressed_blocks); + break; + default: + erofs_err(inode->i_sb, + "unsupported on-disk inode version %u of nid %llu", + erofs_inode_version(ifmt), vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + + if (!nblks) + /* measure inode.i_blocks as generic filesystems */ + inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; + else + inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; + return 0; + +bogusimode: + erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", + inode->i_mode, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + +static int erofs_fill_symlink(struct inode *inode, void *data, + unsigned int m_pofs) +{ + struct erofs_inode *vi = 
EROFS_I(inode); + char *lnk; + + /* if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + inode->i_size >= PAGE_SIZE) { + inode->i_op = &erofs_symlink_iops; + return 0; + } + + lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); + if (!lnk) + return -ENOMEM; + + m_pofs += vi->inode_isize + vi->xattr_isize; + /* inline symlink data shouldn't cross page boundary as well */ + if (m_pofs + inode->i_size > PAGE_SIZE) { + kfree(lnk); + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + memcpy(lnk, data + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + + inode->i_link = lnk; + inode->i_op = &erofs_fast_symlink_iops; + return 0; +} + +static int erofs_fill_inode(struct inode *inode, int isdir) +{ + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct page *page; + void *data; + int err; + erofs_blk_t blkaddr; + unsigned int ofs; + erofs_off_t inode_loc; + + trace_erofs_fill_inode(inode, isdir); + inode_loc = iloc(EROFS_SB(sb), vi->nid); + blkaddr = erofs_blknr(inode_loc); + ofs = erofs_blkoff(inode_loc); + + erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, ofs, blkaddr); + + page = erofs_get_meta_page(sb, blkaddr); + + if (IS_ERR(page)) { + erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); + return PTR_ERR(page); + } + + DBG_BUGON(!PageUptodate(page)); + data = page_address(page); + + err = erofs_read_inode(inode, data + ofs); + if (err) + goto out_unlock; + + /* setup the new inode */ + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &erofs_generic_iops; + inode->i_fop = &generic_ro_fops; + break; + case S_IFDIR: + inode->i_op = &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + break; + case S_IFLNK: + err = erofs_fill_symlink(inode, data, ofs); + if (err) + goto out_unlock; + inode_nohighmem(inode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + inode->i_op = &erofs_generic_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + goto out_unlock; + default: + err = -EFSCORRUPTED; + goto out_unlock; + } + + if (erofs_inode_is_data_compressed(vi->datalayout)) { + err = z_erofs_fill_inode(inode); + goto out_unlock; + } + inode->i_mapping->a_ops = &erofs_raw_access_aops; + +out_unlock: + unlock_page(page); + put_page(page); + return err; +} + +/* + * erofs nid is 64bits, but i_ino is 'unsigned long', therefore + * we should do more for 32-bit platform to find the right inode. 
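+ *
+ * (Editor's note: erofs_inode_hash() in internal.h folds the nid on
+ * 32-bit platforms as
+ *
+ *	hash = (nid >> 32) ^ (nid & 0xffffffff);
+ *
+ * which can collide, so the test actor below still compares the full
+ * 64-bit nid.)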
+ */ +static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + return EROFS_I(inode)->nid == nid; +} + +static int erofs_iget_set_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + inode->i_ino = erofs_inode_hash(nid); + return 0; +} + +static inline struct inode *erofs_iget_locked(struct super_block *sb, + erofs_nid_t nid) +{ + const unsigned long hashval = erofs_inode_hash(nid); + + return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + erofs_iget_set_actor, &nid); +} + +struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, + bool isdir) +{ + struct inode *inode = erofs_iget_locked(sb, nid); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int err; + struct erofs_inode *vi = EROFS_I(inode); + + vi->nid = nid; + + err = erofs_fill_inode(inode, isdir); + if (!err) + unlock_new_inode(inode); + else { + iget_failed(inode); + inode = ERR_PTR(err); + } + } + return inode; +} + +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *const inode = d_inode(path->dentry); + + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE); + + generic_fillattr(inode, stat); + return 0; +} + +const struct inode_operations erofs_generic_iops = { + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_symlink_iops = { + .get_link = page_get_link, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_fast_symlink_iops = { + .get_link = simple_get_link, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h new file mode 100644 index 000000000000..544a453f3076 --- /dev/null +++ b/fs/erofs/internal.h @@ -0,0 +1,431 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_INTERNAL_H +#define __EROFS_INTERNAL_H + +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "erofs_fs.h" + +/* redefine pr_fmt "erofs: " */ +#undef pr_fmt +#define pr_fmt(fmt) "erofs: " fmt + +__printf(3, 4) void _erofs_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_err(sb, fmt, ...) \ + _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) +__printf(3, 4) void _erofs_info(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_info(sb, fmt, ...) \ + _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) +#ifdef CONFIG_EROFS_FS_DEBUG +#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__) +#define DBG_BUGON BUG_ON +#else +#define erofs_dbg(x, ...) 
((void)0) +#define DBG_BUGON(x) ((void)(x)) +#endif /* !CONFIG_EROFS_FS_DEBUG */ + +/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ +#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 + +typedef u64 erofs_nid_t; +typedef u64 erofs_off_t; +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; + +struct erofs_sb_info { +#ifdef CONFIG_EROFS_FS_ZIP + /* list for all registered superblocks, mainly for shrinker */ + struct list_head list; + struct mutex umount_mutex; + + /* the dedicated workstation for compression */ + struct radix_tree_root workstn_tree; + + /* threshold for decompression synchronously */ + unsigned int max_sync_decompress_pages; + + unsigned int shrinker_run_no; + + /* current strategy of how to use managed cache */ + unsigned char cache_strategy; + + /* pseudo inode to manage cached pages */ + struct inode *managed_cache; +#endif /* CONFIG_EROFS_FS_ZIP */ + u32 blocks; + u32 meta_blkaddr; +#ifdef CONFIG_EROFS_FS_XATTR + u32 xattr_blkaddr; +#endif + + /* inode slot unit size in bit shift */ + unsigned char islotbits; + + u32 build_time_nsec; + u64 build_time; + + /* what we really care is nid, rather than ino.. */ + erofs_nid_t root_nid; + /* used for statfs, f_files - f_favail */ + u64 inos; + + u8 uuid[16]; /* 128-bit uuid for volume */ + u8 volume_name[16]; /* volume name */ + u32 feature_incompat; + + unsigned int mount_opt; +}; + +#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) +#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) + +/* Mount flags set via mount options or defaults */ +#define EROFS_MOUNT_XATTR_USER 0x00000010 +#define EROFS_MOUNT_POSIX_ACL 0x00000020 + +#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) + +#ifdef CONFIG_EROFS_FS_ZIP +enum { + EROFS_ZIP_CACHE_DISABLED, + EROFS_ZIP_CACHE_READAHEAD, + EROFS_ZIP_CACHE_READAROUND +}; + +#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) + +/* basic unit of the workstation of a super_block */ +struct erofs_workgroup { + /* the workgroup index in the workstation */ + pgoff_t index; + + /* overall workgroup reference count */ + atomic_t refcount; +}; + +#if defined(CONFIG_SMP) +static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) +{ + preempt_disable(); + if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { + preempt_enable(); + return false; + } + return true; +} + +static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, + int orig_val) +{ + /* + * other observers should notice all modifications + * in the freezing period. + */ + smp_mb(); + atomic_set(&grp->refcount, orig_val); + preempt_enable(); +} + +static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) +{ + return atomic_cond_read_relaxed(&grp->refcount, + VAL != EROFS_LOCKED_MAGIC); +} +#else +static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) +{ + preempt_disable(); + /* no need to spin on UP platforms, let's just disable preemption. 
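+ * (Editor's note: the caller pattern, cf. erofs_try_to_release_workgroup()
+ * in utils.c, is
+ *
+ *	if (!erofs_workgroup_try_to_freeze(grp, 1))
+ *		return false;
+ *	...
+ *	erofs_workgroup_unfreeze(grp, 1);
+ *
+ * With preemption disabled in between, a plain atomic_read() cannot
+ * race on UP, while the SMP variant above needs atomic_cmpxchg().)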
 */
+ if (val != atomic_read(&grp->refcount)) {
+ preempt_enable();
+ return false;
+ }
+ return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+ int orig_val)
+{
+ preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+ int v = atomic_read(&grp->refcount);
+
+ /* workgroups are never frozen on uniprocessor systems */
+ DBG_BUGON(v == EROFS_LOCKED_MAGIC);
+ return v;
+}
+#endif /* !CONFIG_SMP */
+
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PCPUBUF_NR_PAGES 0
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
+/* we strictly follow PAGE_SIZE and have no buffer head yet */
+#define LOG_BLOCK_SIZE PAGE_SHIFT
+
+#undef LOG_SECTORS_PER_BLOCK
+#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9)
+
+#undef SECTORS_PER_BLOCK
+#define SECTORS_PER_BLOCK (1 << LOG_SECTORS_PER_BLOCK)
+
+#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE)
+
+#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
+#error erofs cannot be used on this platform
+#endif
+
+#define ROOT_NID(sb) ((sb)->root_nid)
+
+#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
+#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ)
+#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ)
+
+static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+ return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
+}
+
+/* atomic flag definitions */
+#define EROFS_I_EA_INITED_BIT 0
+#define EROFS_I_Z_INITED_BIT 1
+
+/* bitlock definitions (arranged in reverse order) */
+#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
+#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+
+struct erofs_inode {
+ erofs_nid_t nid;
+
+ /* atomic flags (including bitlocks) */
+ unsigned long flags;
+
+ unsigned char datalayout;
+ unsigned char inode_isize;
+ unsigned short xattr_isize;
+
+ unsigned int xattr_shared_count;
+ unsigned int *xattr_shared_xattrs;
+
+ union {
+ erofs_blk_t raw_blkaddr;
+#ifdef CONFIG_EROFS_FS_ZIP
+ struct {
+ unsigned short z_advise;
+ unsigned char z_algorithmtype[2];
+ unsigned char z_logical_clusterbits;
+ unsigned char z_physical_clusterbits[2];
+ };
+#endif /* CONFIG_EROFS_FS_ZIP */
+ };
+ /* the corresponding vfs inode */
+ struct inode vfs_inode;
+};
+
+#define EROFS_I(ptr) \
+ container_of(ptr, struct erofs_inode, vfs_inode)
+
+static inline unsigned long erofs_inode_datablocks(struct inode *inode)
+{
+ /* since i_size cannot be changed */
+ return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+}
+
+static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit,
+ unsigned int bits)
+{
+ return (value >> bit) & ((1 << bits) - 1);
+}
+
+static inline unsigned int erofs_inode_version(unsigned int value)
+{
+ return erofs_bitrange(value, EROFS_I_VERSION_BIT,
+ EROFS_I_VERSION_BITS);
+}
+
+static inline unsigned int erofs_inode_datalayout(unsigned int value)
+{
+ return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT,
+ EROFS_I_DATALAYOUT_BITS);
+}
+
+extern const struct super_operations erofs_sops;
+
+extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
+#endif
+
+/*
+ * Logical to physical block mapping, used by erofs_map_blocks()
+ *
+ * Unlike other file systems, it is used in two access modes:
+ *
+ * 1) RAW access mode:
+ *
+ * Users pass a valid (m_lblk, m_lofs -- usually 0)
pair, + * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). + * + * Note that m_lblk in the RAW access mode refers to the number of + * the compressed ondisk block rather than the uncompressed + * in-memory block for the compressed file. + * + * m_pofs equals to m_lofs except for the inline data page. + * + * 2) Normal access mode: + * + * If the inode is not compressed, it has no difference with + * the RAW access mode. However, if the inode is compressed, + * users should pass a valid (m_lblk, m_lofs) pair, and get + * the needed m_pblk, m_pofs, m_len to get the compressed data + * and the updated m_lblk, m_lofs which indicates the start + * of the corresponding uncompressed data in the file. + */ +enum { + BH_Zipped = BH_PrivateStart, + BH_FullMapped, +}; + +/* Has a disk mapping */ +#define EROFS_MAP_MAPPED (1 << BH_Mapped) +/* Located in metadata (could be copied from bd_inode) */ +#define EROFS_MAP_META (1 << BH_Meta) +/* The extent has been compressed */ +#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The length of extent is full */ +#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) + +struct erofs_map_blocks { + erofs_off_t m_pa, m_la; + u64 m_plen, m_llen; + + unsigned int m_flags; + + struct page *mpage; +}; + +/* Flags used by erofs_map_blocks() */ +#define EROFS_GET_BLOCKS_RAW 0x0001 + +/* zmap.c */ +#ifdef CONFIG_EROFS_FS_ZIP +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags); +#else +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } +static inline int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + +/* data.c */ +struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); + +int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); + +/* inode.c */ +static inline unsigned long erofs_inode_hash(erofs_nid_t nid) +{ +#if BITS_PER_LONG == 32 + return (nid >> 32) ^ (nid & 0xffffffff); +#else + return nid; +#endif +} + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; + +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); + +/* namei.c */ +extern const struct inode_operations erofs_dir_iops; + +int erofs_namei(struct inode *dir, struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type); + +/* dir.c */ +extern const struct file_operations erofs_dir_fops; + +/* utils.c / zdata.c */ +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail); + +#if (EROFS_PCPUBUF_NR_PAGES > 0) +void *erofs_get_pcpubuf(unsigned int pagenr); +#define erofs_put_pcpubuf(buf) do { \ + (void)&(buf); \ + preempt_enable(); \ +} while (0) +#else +static inline void *erofs_get_pcpubuf(unsigned int pagenr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +#define erofs_put_pcpubuf(buf) do {} while (0) +#endif + +#ifdef CONFIG_EROFS_FS_ZIP +int erofs_workgroup_put(struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index, bool *tag); +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, bool tag); +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +void 
erofs_shrinker_register(struct super_block *sb); +void erofs_shrinker_unregister(struct super_block *sb); +int __init erofs_init_shrinker(void); +void erofs_exit_shrinker(void); +int __init z_erofs_init_zip_subsystem(void); +void z_erofs_exit_zip_subsystem(void); +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); +#else +static inline void erofs_shrinker_register(struct super_block *sb) {} +static inline void erofs_shrinker_unregister(struct super_block *sb) {} +static inline int erofs_init_shrinker(void) { return 0; } +static inline void erofs_exit_shrinker(void) {} +static inline int z_erofs_init_zip_subsystem(void) { return 0; } +static inline void z_erofs_exit_zip_subsystem(void) {} +#endif /* !CONFIG_EROFS_FS_ZIP */ + +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* __EROFS_INTERNAL_H */ + diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c new file mode 100644 index 000000000000..3abbecbf73de --- /dev/null +++ b/fs/erofs/namei.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +struct erofs_qstr { + const unsigned char *name; + const unsigned char *end; +}; + +/* based on the end of qn is accurate and it must have the trailing '\0' */ +static inline int erofs_dirnamecmp(const struct erofs_qstr *qn, + const struct erofs_qstr *qd, + unsigned int *matched) +{ + unsigned int i = *matched; + + /* + * on-disk error, let's only BUG_ON in the debugging mode. + * otherwise, it will return 1 to just skip the invalid name + * and go on (in consideration of the lookup performance). + */ + DBG_BUGON(qd->name > qd->end); + + /* qd could not have trailing '\0' */ + /* However it is absolutely safe if < qd->end */ + while (qd->name + i < qd->end && qd->name[i] != '\0') { + if (qn->name[i] != qd->name[i]) { + *matched = i; + return qn->name[i] > qd->name[i] ? 1 : -1; + } + ++i; + } + *matched = i; + /* See comments in __d_alloc on the terminating NUL character */ + return qn->name[i] == '\0' ? 0 : 1; +} + +#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1)) + +static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, + u8 *data, + unsigned int dirblksize, + const int ndirents) +{ + int head, back; + unsigned int startprfx, endprfx; + struct erofs_dirent *const de = (struct erofs_dirent *)data; + + /* since the 1st dirent has been evaluated previously */ + head = 1; + back = ndirents - 1; + startprfx = endprfx = 0; + + while (head <= back) { + const int mid = head + (back - head) / 2; + const int nameoff = nameoff_from_disk(de[mid].nameoff, + dirblksize); + unsigned int matched = min(startprfx, endprfx); + struct erofs_qstr dname = { + .name = data + nameoff, + .end = mid >= ndirents - 1 ? 
+ data + dirblksize : + data + nameoff_from_disk(de[mid + 1].nameoff, + dirblksize) + }; + + /* string comparison without already matched prefix */ + int ret = erofs_dirnamecmp(name, &dname, &matched); + + if (!ret) { + return de + mid; + } else if (ret > 0) { + head = mid + 1; + startprfx = matched; + } else { + back = mid - 1; + endprfx = matched; + } + } + + return ERR_PTR(-ENOENT); +} + +static struct page *find_target_block_classic(struct inode *dir, + struct erofs_qstr *name, + int *_ndirents) +{ + unsigned int startprfx, endprfx; + int head, back; + struct address_space *const mapping = dir->i_mapping; + struct page *candidate = ERR_PTR(-ENOENT); + + startprfx = endprfx = 0; + head = 0; + back = erofs_inode_datablocks(dir) - 1; + + while (head <= back) { + const int mid = head + (back - head) / 2; + struct page *page = read_mapping_page(mapping, mid, NULL); + + if (!IS_ERR(page)) { + struct erofs_dirent *de = kmap_atomic(page); + const int nameoff = nameoff_from_disk(de->nameoff, + EROFS_BLKSIZ); + const int ndirents = nameoff / sizeof(*de); + int diff; + unsigned int matched; + struct erofs_qstr dname; + + if (!ndirents) { + kunmap_atomic(de); + put_page(page); + erofs_err(dir->i_sb, + "corrupted dir block %d @ nid %llu", + mid, EROFS_I(dir)->nid); + DBG_BUGON(1); + page = ERR_PTR(-EFSCORRUPTED); + goto out; + } + + matched = min(startprfx, endprfx); + + dname.name = (u8 *)de + nameoff; + if (ndirents == 1) + dname.end = (u8 *)de + EROFS_BLKSIZ; + else + dname.end = (u8 *)de + + nameoff_from_disk(de[1].nameoff, + EROFS_BLKSIZ); + + /* string comparison without already matched prefix */ + diff = erofs_dirnamecmp(name, &dname, &matched); + kunmap_atomic(de); + + if (!diff) { + *_ndirents = 0; + goto out; + } else if (diff > 0) { + head = mid + 1; + startprfx = matched; + + if (!IS_ERR(candidate)) + put_page(candidate); + candidate = page; + *_ndirents = ndirents; + } else { + put_page(page); + + back = mid - 1; + endprfx = matched; + } + continue; + } +out: /* free if the candidate is valid */ + if (!IS_ERR(candidate)) + put_page(candidate); + return page; + } + return candidate; +} + +int erofs_namei(struct inode *dir, + struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type) +{ + int ndirents; + struct page *page; + void *data; + struct erofs_dirent *de; + struct erofs_qstr qn; + + if (!dir->i_size) + return -ENOENT; + + qn.name = name->name; + qn.end = name->name + name->len; + + ndirents = 0; + page = find_target_block_classic(dir, &qn, &ndirents); + + if (IS_ERR(page)) + return PTR_ERR(page); + + data = kmap_atomic(page); + /* the target page has been mapped */ + if (ndirents) + de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents); + else + de = (struct erofs_dirent *)data; + + if (!IS_ERR(de)) { + *nid = le64_to_cpu(de->nid); + *d_type = de->file_type; + } + + kunmap_atomic(data); + put_page(page); + + return PTR_ERR_OR_ZERO(de); +} + +/* NOTE: i_mutex is already held by vfs */ +static struct dentry *erofs_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + int err; + erofs_nid_t nid; + unsigned int d_type; + struct inode *inode; + + DBG_BUGON(!d_really_is_negative(dentry)); + /* dentry must be unhashed in lookup, no need to worry about */ + DBG_BUGON(!d_unhashed(dentry)); + + trace_erofs_lookup(dir, dentry, flags); + + /* file name exceeds fs limit */ + if (dentry->d_name.len > EROFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + /* false uninitialized warnings on gcc 4.8.x */ + err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); + + if 
(err == -ENOENT) { + /* negative dentry */ + inode = NULL; + } else if (err) { + inode = ERR_PTR(err); + } else { + erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__, + dentry->d_name.name, nid, d_type); + inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + } + return d_splice_alias(inode, dentry); +} + +const struct inode_operations erofs_dir_iops = { + .lookup = erofs_lookup, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + diff --git a/fs/erofs/super.c b/fs/erofs/super.c new file mode 100644 index 000000000000..caf9a95173b0 --- /dev/null +++ b/fs/erofs/super.c @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/statfs.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "xattr.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/erofs.h> + +static struct kmem_cache *erofs_inode_cachep __read_mostly; + +void _erofs_err(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); +} + +void _erofs_info(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_info("(device %s): %pV", sb->s_id, &vaf); + va_end(args); +} + +static void erofs_inode_init_once(void *ptr) +{ + struct erofs_inode *vi = ptr; + + inode_init_once(&vi->vfs_inode); +} + +static struct inode *erofs_alloc_inode(struct super_block *sb) +{ + struct erofs_inode *vi = + kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + + if (!vi) + return NULL; + + /* zero out everything except vfs_inode */ + memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); + return &vi->vfs_inode; +} + +static void erofs_free_inode(struct inode *inode) +{ + struct erofs_inode *vi = EROFS_I(inode); + + /* be careful of RCU symlink path */ + if (inode->i_op == &erofs_fast_symlink_iops) + kfree(inode->i_link); + kfree(vi->xattr_shared_xattrs); + + kmem_cache_free(erofs_inode_cachep, vi); +} + +static bool check_layout_compatibility(struct super_block *sb, + struct erofs_super_block *dsb) +{ + const unsigned int feature = le32_to_cpu(dsb->feature_incompat); + + EROFS_SB(sb)->feature_incompat = feature; + + /* check if current kernel meets all mandatory requirements */ + if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { + erofs_err(sb, + "unidentified incompatible feature %x, please upgrade kernel version", + feature & ~EROFS_ALL_FEATURE_INCOMPAT); + return false; + } + return true; +} + +static int erofs_read_superblock(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + struct page *page; + struct erofs_super_block *dsb; + unsigned int blkszbits; + void *data; + int ret; + + page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); + if (!page) { + erofs_err(sb, "cannot read erofs superblock"); + return -EIO; + } + + sbi = EROFS_SB(sb); + + data = kmap_atomic(page); + dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); + + ret = -EINVAL; + if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { + erofs_err(sb, "cannot find valid erofs superblock"); + goto out; + } + + blkszbits = dsb->blkszbits; + 
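+ /*
+  * Editor's note (illustration, not part of this patch): with 4KiB
+  * pages, LOG_BLOCK_SIZE == 12, so only images created with
+  * blkszbits == 12 (the mkfs.erofs default) pass the check below:
+  *
+  *	9 + LOG_SECTORS_PER_BLOCK == 9 + (12 - 9) == LOG_BLOCK_SIZE
+  */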
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ + if (blkszbits != LOG_BLOCK_SIZE) { + erofs_err(sb, "blksize %u isn't supported on this platform", + 1 << blkszbits); + goto out; + } + + if (!check_layout_compatibility(sb, dsb)) + goto out; + + sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); +#ifdef CONFIG_EROFS_FS_XATTR + sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); +#endif + sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); + sbi->root_nid = le16_to_cpu(dsb->root_nid); + sbi->inos = le64_to_cpu(dsb->inos); + + sbi->build_time = le64_to_cpu(dsb->build_time); + sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); + + memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + + ret = strscpy(sbi->volume_name, dsb->volume_name, + sizeof(dsb->volume_name)); + if (ret < 0) { /* -E2BIG */ + erofs_err(sb, "bad volume name without NIL terminator"); + ret = -EFSCORRUPTED; + goto out; + } + ret = 0; +out: + kunmap_atomic(data); + put_page(page); + return ret; +} + +#ifdef CONFIG_EROFS_FS_ZIP +static int erofs_build_cache_strategy(struct super_block *sb, + substring_t *args) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + const char *cs = match_strdup(args); + int err = 0; + + if (!cs) { + erofs_err(sb, "Not enough memory to store cache strategy"); + return -ENOMEM; + } + + if (!strcmp(cs, "disabled")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED; + } else if (!strcmp(cs, "readahead")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD; + } else if (!strcmp(cs, "readaround")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + } else { + erofs_err(sb, "Unrecognized cache strategy \"%s\"", cs); + err = -EINVAL; + } + kfree(cs); + return err; +} +#else +static int erofs_build_cache_strategy(struct super_block *sb, + substring_t *args) +{ + erofs_info(sb, "EROFS compression is disabled, so cache strategy is ignored"); + return 0; +} +#endif + +/* set up default EROFS parameters */ +static void erofs_default_options(struct erofs_sb_info *sbi) +{ +#ifdef CONFIG_EROFS_FS_ZIP + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->max_sync_decompress_pages = 3; +#endif +#ifdef CONFIG_EROFS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif +} + +enum { + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_cache_strategy, + Opt_err +}; + +static match_table_t erofs_tokens = { + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_cache_strategy, "cache_strategy=%s"}, + {Opt_err, NULL} +}; + +static int erofs_parse_options(struct super_block *sb, char *options) +{ + substring_t args[MAX_OPT_ARGS]; + char *p; + int err; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, erofs_tokens, args); + + switch (token) { +#ifdef CONFIG_EROFS_FS_XATTR + case Opt_user_xattr: + set_opt(EROFS_SB(sb), XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(EROFS_SB(sb), XATTR_USER); + break; +#else + case Opt_user_xattr: + erofs_info(sb, "user_xattr options not supported"); + break; + case Opt_nouser_xattr: + erofs_info(sb, "nouser_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + case Opt_acl: + set_opt(EROFS_SB(sb), POSIX_ACL); + break; + case Opt_noacl: + clear_opt(EROFS_SB(sb), POSIX_ACL); + break; +#else + case Opt_acl: + 
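+ /*
+  * Editor's note (illustration, not part of this patch): the option
+  * is parsed but deliberately ignored, so a mount such as
+  *
+  *	mount -t erofs -o acl /dev/loop0 /mnt	(device name illustrative)
+  *
+  * still succeeds on kernels built without CONFIG_EROFS_FS_POSIX_ACL.
+  */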
erofs_info(sb, "acl options not supported"); + break; + case Opt_noacl: + erofs_info(sb, "noacl options not supported"); + break; +#endif + case Opt_cache_strategy: + err = erofs_build_cache_strategy(sb, args); + if (err) + return err; + break; + default: + erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p); + return -EINVAL; + } + } + return 0; +} + +#ifdef CONFIG_EROFS_FS_ZIP +static const struct address_space_operations managed_cache_aops; + +static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +{ + int ret = 1; /* 0 - busy */ + struct address_space *const mapping = page->mapping; + + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(mapping->a_ops != &managed_cache_aops); + + if (PagePrivate(page)) + ret = erofs_try_to_free_cached_page(mapping, page); + + return ret; +} + +static void erofs_managed_cache_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + const unsigned int stop = length + offset; + + DBG_BUGON(!PageLocked(page)); + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > PAGE_SIZE || stop < length); + + if (offset == 0 && stop == PAGE_SIZE) + while (!erofs_managed_cache_releasepage(page, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations managed_cache_aops = { + .releasepage = erofs_managed_cache_releasepage, + .invalidatepage = erofs_managed_cache_invalidatepage, +}; + +static int erofs_init_managed_cache(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + + inode->i_mapping->a_ops = &managed_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + sbi->managed_cache = inode; + return 0; +} +#else +static int erofs_init_managed_cache(struct super_block *sb) { return 0; } +#endif + +static int erofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct erofs_sb_info *sbi; + int err; + + sb->s_magic = EROFS_SUPER_MAGIC; + + if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { + erofs_err(sb, "failed to set erofs blksize"); + return -EINVAL; + } + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + err = erofs_read_superblock(sb); + if (err) + return err; + + sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + + sb->s_op = &erofs_sops; + +#ifdef CONFIG_EROFS_FS_XATTR + sb->s_xattr = erofs_xattr_handlers; +#endif + /* set erofs default mount options */ + erofs_default_options(sbi); + + err = erofs_parse_options(sb, data); + if (err) + return err; + + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + +#ifdef CONFIG_EROFS_FS_ZIP + INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); +#endif + + /* get the root inode */ + inode = erofs_iget(sb, ROOT_NID(sbi), true); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (!S_ISDIR(inode->i_mode)) { + erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + iput(inode); + return -EINVAL; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + erofs_shrinker_register(sb); + /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ + err = erofs_init_managed_cache(sb); + if (err) + return err; + + erofs_info(sb, "mounted with opts: %s, root inode @ nid %llu.", + (char 
*)data, ROOT_NID(sbi)); + return 0; +} + +static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super); +} + +/* + * could be triggered after deactivate_locked_super() + * is called, thus including umount and failed to initialize. + */ +static void erofs_kill_sb(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + + WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + + kill_block_super(sb); + + sbi = EROFS_SB(sb); + if (!sbi) + return; + kfree(sbi); + sb->s_fs_info = NULL; +} + +/* called when ->s_root is non-NULL */ +static void erofs_put_super(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + DBG_BUGON(!sbi); + + erofs_shrinker_unregister(sb); +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + +static struct file_system_type erofs_fs_type = { + .owner = THIS_MODULE, + .name = "erofs", + .mount = erofs_mount, + .kill_sb = erofs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("erofs"); + +static int __init erofs_module_init(void) +{ + int err; + + erofs_check_ondisk_layout_definitions(); + + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT, + erofs_inode_init_once); + if (!erofs_inode_cachep) { + err = -ENOMEM; + goto icache_err; + } + + err = erofs_init_shrinker(); + if (err) + goto shrinker_err; + + err = z_erofs_init_zip_subsystem(); + if (err) + goto zip_err; + + err = register_filesystem(&erofs_fs_type); + if (err) + goto fs_err; + + return 0; + +fs_err: + z_erofs_exit_zip_subsystem(); +zip_err: + erofs_exit_shrinker(); +shrinker_err: + kmem_cache_destroy(erofs_inode_cachep); +icache_err: + return err; +} + +static void __exit erofs_module_exit(void) +{ + unregister_filesystem(&erofs_fs_type); + z_erofs_exit_zip_subsystem(); + erofs_exit_shrinker(); + + /* Ensure all RCU free inodes are safe before cache is destroyed. 
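+ * (Editor's note: inodes are freed through an RCU callback after
+ * ->free_inode(), hence the ordering below:
+ *
+ *	rcu_barrier();				(wait for pending callbacks)
+ *	kmem_cache_destroy(erofs_inode_cachep);	(nothing is in flight now)
+ *
+ * synchronize_rcu() would not be enough, as it only waits for a grace
+ * period, not for the callbacks themselves.)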
*/ + rcu_barrier(); + kmem_cache_destroy(erofs_inode_cachep); +} + +/* get filesystem statistics */ +static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = EROFS_BLKSIZ; + buf->f_blocks = sbi->blocks; + buf->f_bfree = buf->f_bavail = 0; + + buf->f_files = ULLONG_MAX; + buf->f_ffree = ULLONG_MAX - sbi->inos; + + buf->f_namelen = EROFS_NAME_LEN; + + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +static int erofs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); + +#ifdef CONFIG_EROFS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif +#ifdef CONFIG_EROFS_FS_ZIP + if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) { + seq_puts(seq, ",cache_strategy=disabled"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) { + seq_puts(seq, ",cache_strategy=readahead"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { + seq_puts(seq, ",cache_strategy=readaround"); + } else { + seq_puts(seq, ",cache_strategy=(unknown)"); + DBG_BUGON(1); + } +#endif + return 0; +} + +static int erofs_remount(struct super_block *sb, int *flags, char *data) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int org_mnt_opt = sbi->mount_opt; + int err; + + DBG_BUGON(!sb_rdonly(sb)); + err = erofs_parse_options(sb, data); + if (err) + goto out; + + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + + *flags |= SB_RDONLY; + return 0; +out: + sbi->mount_opt = org_mnt_opt; + return err; +} + +const struct super_operations erofs_sops = { + .put_super = erofs_put_super, + .alloc_inode = erofs_alloc_inode, + .free_inode = erofs_free_inode, + .statfs = erofs_statfs, + .show_options = erofs_show_options, + .remount_fs = erofs_remount, +}; + +module_init(erofs_module_init); +module_exit(erofs_module_exit); + +MODULE_DESCRIPTION("Enhanced ROM File System"); +MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_LICENSE("GPL"); + diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h new file mode 100644 index 000000000000..a72897c86744 --- /dev/null +++ b/fs/erofs/tagptr.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * A tagged pointer implementation + * + * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_TAGPTR_H +#define __EROFS_FS_TAGPTR_H + +#include <linux/types.h> +#include <linux/build_bug.h> + +/* + * the name of tagged pointer types are tagptr{1, 2, 3...}_t + * avoid directly using the internal structs __tagptr{1, 2, 3...} + */ +#define __MAKE_TAGPTR(n) \ +typedef struct __tagptr##n { \ + uintptr_t v; \ +} tagptr##n##_t; + +__MAKE_TAGPTR(1) +__MAKE_TAGPTR(2) +__MAKE_TAGPTR(3) +__MAKE_TAGPTR(4) + +#undef __MAKE_TAGPTR + +extern void __compiletime_error("bad tagptr tags") + __bad_tagptr_tags(void); + +extern void __compiletime_error("bad tagptr type") + __bad_tagptr_type(void); + +/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ +#define __tagptr_mask_1(ptr, n) \ + __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? 
\ + (1UL << (n)) - 1 : + +#define __tagptr_mask(ptr) (\ + __tagptr_mask_1(ptr, 1) ( \ + __tagptr_mask_1(ptr, 2) ( \ + __tagptr_mask_1(ptr, 3) ( \ + __tagptr_mask_1(ptr, 4) ( \ + __bad_tagptr_type(), 0))))) + +/* generate a tagged pointer from a raw value */ +#define tagptr_init(type, val) \ + ((typeof(type)){ .v = (uintptr_t)(val) }) + +/* + * directly cast a tagged pointer to the native pointer type, which + * could be used for backward compatibility of existing code. + */ +#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) + +/* encode tagged pointers */ +#define tagptr_fold(type, ptr, _tags) ({ \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ + __bad_tagptr_tags(); \ +tagptr_init(type, (uintptr_t)(ptr) | tags); }) + +/* decode tagged pointers */ +#define tagptr_unfold_ptr(tptr) \ + ((void *)((tptr).v & ~__tagptr_mask(tptr))) + +#define tagptr_unfold_tags(tptr) \ + ((tptr).v & __tagptr_mask(tptr)) + +/* operations for the tagger pointer */ +#define tagptr_eq(_tptr1, _tptr2) ({ \ + typeof(_tptr1) tptr1 = (_tptr1); \ + typeof(_tptr2) tptr2 = (_tptr2); \ + (void)(&tptr1 == &tptr2); \ +(tptr1).v == (tptr2).v; }) + +/* lock-free CAS operation */ +#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + typeof(_o) o = (_o); \ + typeof(_n) n = (_n); \ + (void)(&o == &n); \ + (void)(&o == ptptr); \ +tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) + +/* wrap WRITE_ONCE if atomic update is needed */ +#define tagptr_replace_tags(_ptptr, tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ +*ptptr; }) + +#define tagptr_set_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v |= tags; \ +*ptptr; }) + +#define tagptr_clear_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v &= ~tags; \ +*ptptr; }) + +#endif /* __EROFS_FS_TAGPTR_H */ + diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c new file mode 100644 index 000000000000..d92b3e753a6f --- /dev/null +++ b/fs/erofs/utils.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <linux/pagevec.h> + +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) +{ + struct page *page; + + if (!list_empty(pool)) { + page = lru_to_page(pool); + DBG_BUGON(page_ref_count(page) != 1); + list_del(&page->lru); + } else { + page = alloc_pages(gfp | (nofail ? 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 000000000000..d92b3e753a6f
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <linux/pagevec.h>
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		DBG_BUGON(page_ref_count(page) != 1);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
+	}
+	return page;
+}
+
+#if (EROFS_PCPUBUF_NR_PAGES > 0)
+static struct {
+	u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
+} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
+
+void *erofs_get_pcpubuf(unsigned int pagenr)
+{
+	preempt_disable();
+	return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
+}
+#endif
+
+#ifdef CONFIG_EROFS_FS_ZIP
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+#define __erofs_workgroup_put(grp)	atomic_dec(&(grp)->refcount)
+
+static int erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+	int o;
+
+repeat:
+	o = erofs_wait_on_workgroup_freezed(grp);
+	if (o <= 0)
+		return -1;
+
+	if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
+		goto repeat;
+
+	/* decrease refcount paired by erofs_workgroup_put */
+	if (o == 1)
+		atomic_long_dec(&erofs_global_shrink_cnt);
+	return 0;
+}
+
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+					     pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn_tree, index);
+	if (grp) {
+		*tag = xa_pointer_tag(grp);
+		grp = xa_untag_pointer(grp);
+
+		if (erofs_workgroup_get(grp)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		DBG_BUGON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp shouldn't be broken or used before */
+	if (atomic_read(&grp->refcount) != 1) {
+		DBG_BUGON(1);
+		return -EINVAL;
+	}
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	xa_lock(&sbi->workstn_tree);
+
+	grp = xa_tag_pointer(grp, tag);
+
+	/*
+	 * Bump up the reference count before making this workgroup
+	 * visible to other users in order to avoid potential UAF
+	 * without being serialized by workstn_lock.
+	 */
+	__erofs_workgroup_get(grp);
+
+	err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
+	if (err)
+		/*
+		 * it's safe to decrease since the workgroup isn't visible
+		 * and refcount >= 2 (cannot be frozen).
+		 */
+		__erofs_workgroup_put(grp);
+
+	xa_unlock(&sbi->workstn_tree);
+	radix_tree_preload_end();
+	return err;
+}
+
+static void __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+	atomic_long_dec(&erofs_global_shrink_cnt);
+	erofs_workgroup_free_rcu(grp);
+}
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count)
+		__erofs_workgroup_free(grp);
+	return count;
+}
+
+static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
+{
+	erofs_workgroup_unfreeze(grp, 0);
+	__erofs_workgroup_free(grp);
+}
+
+static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+					   struct erofs_workgroup *grp,
+					   bool cleanup)
+{
+	/*
+	 * If managed cache is on, the refcount of workgroups
+	 * themselves could be < 0 (frozen). In other words,
+	 * there is no guarantee that all refcounts are > 0.
+	 */
+	if (!erofs_workgroup_try_to_freeze(grp, 1))
+		return false;
+
+	/*
+	 * Note that all cached pages should be unattached
+	 * before being deleted from the radix tree.
+	 * Otherwise, some cached pages could still be attached to the
+	 * orphan old workgroup when the new one is available in the tree.
+	 */
+	if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
+		erofs_workgroup_unfreeze(grp, 1);
+		return false;
+	}
+
+	/*
+	 * It's impossible to fail after the workgroup is frozen;
+	 * however, in order to avoid some race conditions, add a
+	 * DBG_BUGON to observe this in advance.
+	 */
+	DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
+						     grp->index)) != grp);
+
+	/*
+	 * If managed cache is on, the last refcount should indicate
+	 * the related workstation.
+	 */
+	erofs_workgroup_unfreeze_final(grp);
+	return true;
+}
+
+static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+					      unsigned long nr_shrink,
+					      bool cleanup)
+{
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned int freed = 0;
+
+	int i, found;
+repeat:
+	xa_lock(&sbi->workstn_tree);
+
+	found = radix_tree_gang_lookup(&sbi->workstn_tree,
+				       batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
+
+		first_index = grp->index + 1;
+
+		/* try to shrink each valid workgroup */
+		if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
+			continue;
+
+		++freed;
+		if (!--nr_shrink)
+			break;
+	}
+	xa_unlock(&sbi->workstn_tree);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
+}
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+	mutex_lock(&sbi->umount_mutex);
+	erofs_shrink_workstation(sbi, ~0UL, true);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_del(&sbi->list);
+	spin_unlock(&erofs_sb_list_lock);
+	mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
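/*
 * Editorial sketch (not part of the patch): a userspace model of the
 * count/scan contract implemented by erofs_shrink_count() above and
 * erofs_shrink_scan() below.  count_objects() must stay cheap and only
 * report how many objects are freeable; scan_objects() does the real
 * work and returns how many objects it actually freed.
 */
#include <stdio.h>

struct toy_shrinker {
	unsigned long (*count)(void);
	unsigned long (*scan)(unsigned long nr_to_scan);
};

/* stands in for erofs_global_shrink_cnt */
static unsigned long freeable = 128;

static unsigned long toy_count(void) { return freeable; }

static unsigned long toy_scan(unsigned long nr_to_scan)
{
	unsigned long freed = nr_to_scan < freeable ? nr_to_scan : freeable;

	freeable -= freed;	/* pretend we released those objects */
	return freed;
}

int main(void)
{
	struct toy_shrinker s = { .count = toy_count, .scan = toy_scan };

	while (s.count())	/* plays the role of memory pressure */
		printf("freed %lu, %lu still freeable\n", s.scan(32), s.count());
	return 0;
}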
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do {
+		run_no = ++shrinker_run_no;
+	} while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		freed += erofs_shrink_workstation(sbi, nr, false);
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
+}
+
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
+int __init erofs_init_shrinker(void)
+{
+	return register_shrinker(&erofs_shrinker_info);
+}
+
+void erofs_exit_shrinker(void)
+{
+	unregister_shrinker(&erofs_shrinker_info);
+}
+#endif	/* !CONFIG_EROFS_FS_ZIP */
+
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
new file mode 100644
index 000000000000..a13a78725c57
--- /dev/null
+++ b/fs/erofs/xattr.c
@@ -0,0 +1,704 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include <linux/security.h>
+#include "xattr.h"
+
+struct xattr_iter {
+	struct super_block *sb;
+	struct page *page;
+	void *kaddr;
+
+	erofs_blk_t blkaddr;
+	unsigned int ofs;
+};
+
+static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
+{
+	/* the only user of kunmap() is 'init_inode_xattrs' */
+	if (!atomic)
+		kunmap(it->page);
+	else
+		kunmap_atomic(it->kaddr);
+
+	unlock_page(it->page);
+	put_page(it->page);
+}
+
+static inline void xattr_iter_end_final(struct xattr_iter *it)
+{
+	if (!it->page)
+		return;
+
+	xattr_iter_end(it, true);
+}
+
+static int init_inode_xattrs(struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct xattr_iter it;
+	unsigned int i;
+	struct erofs_xattr_ibody_header *ih;
+	struct super_block *sb;
+	struct erofs_sb_info *sbi;
+	bool atomic_map;
+	int ret = 0;
+
+	/* in most cases, xattrs of this inode have already been initialized */
+	if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+		return 0;
+
+	if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
+		return -ERESTARTSYS;
+
+	/* has someone already initialized xattrs for us? */
+	if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+		goto out_unlock;
+
+	/*
+	 * bypass all xattr operations if ->xattr_isize is not greater than
+	 * sizeof(struct erofs_xattr_ibody_header), in detail:
+	 * 1) it is too small to contain erofs_xattr_ibody_header, so
+	 *    ->xattr_isize should be 0 (which means no xattrs at all);
+	 * 2) it exactly fits erofs_xattr_ibody_header, a layout that is
+	 *    undefined on-disk right now (maybe used later with some new
+	 *    sb feature).
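+	 *
+	 * (Editorial worked example, not part of the patch: in the code
+	 * below, xattr_isize == 0 returns -ENOATTR, xattr_isize equal to
+	 * sizeof(struct erofs_xattr_ibody_header) returns -EOPNOTSUPP, and
+	 * any other value smaller than the header size is treated as
+	 * on-disk corruption.)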
+ */ + if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { + erofs_err(inode->i_sb, + "xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + ret = -EOPNOTSUPP; + goto out_unlock; + } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { + if (vi->xattr_isize) { + erofs_err(inode->i_sb, + "bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + ret = -EFSCORRUPTED; + goto out_unlock; /* xattr ondisk layout error */ + } + ret = -ENOATTR; + goto out_unlock; + } + + sb = inode->i_sb; + sbi = EROFS_SB(sb); + it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + + it.page = erofs_get_meta_page(sb, it.blkaddr); + if (IS_ERR(it.page)) { + ret = PTR_ERR(it.page); + goto out_unlock; + } + + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = kmap(it.page); + atomic_map = false; + + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, + sizeof(uint), GFP_KERNEL); + if (!vi->xattr_shared_xattrs) { + xattr_iter_end(&it, atomic_map); + ret = -ENOMEM; + goto out_unlock; + } + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (it.ofs >= EROFS_BLKSIZ) { + /* cannot be unaligned */ + DBG_BUGON(it.ofs != EROFS_BLKSIZ); + xattr_iter_end(&it, atomic_map); + + it.page = erofs_get_meta_page(sb, ++it.blkaddr); + if (IS_ERR(it.page)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.page); + goto out_unlock; + } + + it.kaddr = kmap_atomic(it.page); + atomic_map = true; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + xattr_iter_end(&it, atomic_map); + + set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); + +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); + return ret; +} + +/* + * the general idea for these return values is + * if 0 is returned, go on processing the current xattr; + * 1 (> 0) is returned, skip this round to process the next xattr; + * -err (< 0) is returned, an error (maybe ENOXATTR) occurred + * and need to be handled + */ +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); + int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); + int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); + void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); +}; + +static inline int xattr_iter_fixup(struct xattr_iter *it) +{ + if (it->ofs < EROFS_BLKSIZ) + return 0; + + xattr_iter_end(it, true); + + it->blkaddr += erofs_blknr(it->ofs); + + it->page = erofs_get_meta_page(it->sb, it->blkaddr); + if (IS_ERR(it->page)) { + int err = PTR_ERR(it->page); + + it->page = NULL; + return err; + } + + it->kaddr = kmap_atomic(it->page); + it->ofs = erofs_blkoff(it->ofs); + return 0; +} + +static int inline_xattr_iter_begin(struct xattr_iter *it, + struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); + unsigned int xattr_header_sz, inline_xattr_ofs; + + xattr_header_sz = inlinexattr_header_size(inode); + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; 
+ } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); + it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); + + it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr); + if (IS_ERR(it->page)) + return PTR_ERR(it->page); + + it->kaddr = kmap_atomic(it->page); + return vi->xattr_isize - xattr_header_sz; +} + +/* + * Regardless of success or failure, `xattr_foreach' will end up with + * `ofs' pointing to the next xattr item rather than an arbitrary position. + */ +static int xattr_foreach(struct xattr_iter *it, + const struct xattr_iter_handlers *op, + unsigned int *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned int value_sz, processed, slice; + int err; + + /* 0. fixup blkaddr, ofs, ipage */ + err = xattr_iter_fixup(it); + if (err) + return err; + + /* + * 1. read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit) { + unsigned int entry_sz = erofs_xattr_entry_size(&entry); + + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (*tlimit < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* xattrs should be 4-byte aligned (on-disk constraint) */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err < 0 ? err : 0; +} + +struct getxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, index; + struct qstr name; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return memcmp(buf, it->name.name + processed, len) ? 
-ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return !it->buffer ? 1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned int processed, + char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static const struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_begin(&it->it, inode); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret != -ENOATTR) + break; + } + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_size; +} + +static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret != -ENOATTR) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret ? 
ret : it->buffer_size; +} + +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +int erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct getxattr_iter it; + + if (!name) + return -EINVAL; + + ret = init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + + it.name.len = strlen(name); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + it.name.name = name; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + it.it.sb = inode->i_sb; + ret = inline_getxattr(inode, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(inode, &it); + return ret; +} + +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + switch (handler->flags) { + case EROFS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case EROFS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case EROFS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + + return erofs_getxattr(inode, handler->flags, name, buffer, size); +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, + .list = erofs_xattr_user_list, + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, + .list = erofs_xattr_trusted_list, + .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler *erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + +struct listxattr_iter { + struct xattr_iter it; + + struct dentry *dentry; + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned int prefix_len; + const char *prefix; + + const struct xattr_handler *h = + erofs_xattr_handler(entry->e_name_index); + + if (!h || (h->list && !h->list(it->dentry))) + return 1; + + prefix = xattr_prefix(h); + prefix_len = strlen(prefix); + + if (!it->buffer) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct xattr_iter *_it, 
+ unsigned int value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static const struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct listxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret) + break; + } + xattr_iter_end_final(&it->it); + return ret ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct listxattr_iter *it) +{ + struct inode *const inode = d_inode(it->dentry); + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + ret = init_inode_xattrs(d_inode(dentry)); + if (ret) + return ret; + + it.dentry = dentry; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + it.it.sb = dentry->d_sb; + + ret = inline_listxattr(&it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(&it); +} + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + int prefix, rc; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + rc = erofs_getxattr(inode, prefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = erofs_getxattr(inode, prefix, "", value, rc); + } + + if (rc == -ENOATTR) + acl = NULL; + else if (rc < 0) + acl = ERR_PTR(rc); + else + acl = posix_acl_from_xattr(&init_user_ns, value, rc); + kfree(value); + return acl; +} +#endif + diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h new file mode 100644 index 000000000000..3585b84d2f20 --- /dev/null +++ b/fs/erofs/xattr.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_XATTR_H +#define __EROFS_XATTR_H + +#include "internal.h" +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Attribute not found */ +#define ENOATTR ENODATA + +static inline unsigned int inlinexattr_header_size(struct inode *inode) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * EROFS_I(inode)->xattr_shared_count; +} + +static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, + unsigned int xattr_id) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return sbi->xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +#else + return 0; +#endif +} + +static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, + unsigned int xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct xattr_handler erofs_xattr_user_handler; +extern const struct xattr_handler erofs_xattr_trusted_handler; +#ifdef CONFIG_EROFS_FS_SECURITY +extern const struct xattr_handler erofs_xattr_security_handler; +#endif + +static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) +{ +static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif +}; + + return idx && idx < ARRAY_SIZE(xattr_handler_map) ? + xattr_handler_map[idx] : NULL; +} + +extern const struct xattr_handler *erofs_xattr_handlers[]; + +int erofs_getxattr(struct inode *, int, const char *, void *, size_t); +ssize_t erofs_listxattr(struct dentry *, char *, size_t); +#else +static inline int erofs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} + +static inline ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_XATTR */ + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type); +#else +#define erofs_get_acl (NULL) +#endif + +#endif + diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c new file mode 100644 index 000000000000..96e34c90f814 --- /dev/null +++ b/fs/erofs/zdata.c @@ -0,0 +1,1431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "zdata.h" +#include "compress.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +/* + * a compressed_pages[] placeholder in order to avoid + * being filled with file pages for in-place decompression. 
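+ * (Editorial note: a DELAYEDALLOC slot is filled with this marker by
+ * preload_compressed_pages() below and is only replaced with a real
+ * managed-cache page later, in pickup_page_for_submission(), right at
+ * I/O submission time.)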
+ */
+#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
+
+/* how to allocate cached pages for a pcluster */
+enum z_erofs_cache_alloctype {
+	DONTALLOC,	/* don't allocate any cached pages */
+	DELAYEDALLOC,	/* delayed allocation (at the time of submitting io) */
+};
+
+/*
+ * tagged pointer with 1-bit tag for all compressed pages
+ * tag 1 - the page is just found with an extra page reference
+ */
+typedef tagptr1_t compressed_page_t;
+
+#define tag_compressed_page_justfound(page) \
+	tagptr_fold(compressed_page_t, page, 1)
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *pcluster_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(pcluster_cachep);
+}
+
+static inline int z_erofs_init_workqueue(void)
+{
+	const unsigned int onlinecpus = num_possible_cpus();
+	const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
+
+	/*
+	 * no need to spawn too many threads; limiting the number of threads
+	 * could minimize the scheduling overhead. Perhaps per-CPU threads
+	 * would be even better?
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
+					    onlinecpus + onlinecpus / 4);
+	return z_erofs_workqueue ? 0 : -ENOMEM;
+}
+
+static void z_erofs_pcluster_init_once(void *ptr)
+{
+	struct z_erofs_pcluster *pcl = ptr;
+	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+	unsigned int i;
+
+	mutex_init(&cl->lock);
+	cl->nr_pages = 0;
+	cl->vcnt = 0;
+	for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
+		pcl->compressed_pages[i] = NULL;
+}
+
+static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
+{
+	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+
+	atomic_set(&pcl->obj.refcount, 1);
+
+	DBG_BUGON(cl->nr_pages);
+	DBG_BUGON(cl->vcnt);
+}
+
+int __init z_erofs_init_zip_subsystem(void)
+{
+	pcluster_cachep = kmem_cache_create("erofs_compress",
+					    Z_EROFS_WORKGROUP_SIZE, 0,
+					    SLAB_RECLAIM_ACCOUNT,
+					    z_erofs_pcluster_init_once);
+	if (pcluster_cachep) {
+		if (!z_erofs_init_workqueue())
+			return 0;
+
+		kmem_cache_destroy(pcluster_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_collectmode {
+	COLLECT_SECONDARY,
+	COLLECT_PRIMARY,
+	/*
+	 * The current collection is the tail of an existing chain, and in
+	 * addition all previously processed chained collections have been
+	 * decided to be hooked up to it.
+ * A new chain will be created for the remaining collections which are + * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, + * the next collection cannot reuse the whole page safely in + * the following scenario: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (belongs to the next cl) | (belongs to the current cl) | + * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + */ + COLLECT_PRIMARY_HOOKED, + COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + /* + * The current collection has been linked with the owned chain, and + * could also be linked with the remaining collections, which means + * if the processing page is the tail page of the collection, thus + * the current collection can safely use the whole page (since + * the previous collection is under control) for in-place I/O, as + * illustrated below: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current cl) | (of the previous collection) | + * | PRIMARY_FOLLOWED or | | + * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * + * [ (*) the above page can be used as inplace I/O. ] + */ + COLLECT_PRIMARY_FOLLOWED, +}; + +struct z_erofs_collector { + struct z_erofs_pagevec_ctor vector; + + struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_collection *cl; + struct page **compressedpages; + z_erofs_next_pcluster_t owned_head; + + enum z_erofs_collectmode mode; +}; + +struct z_erofs_decompress_frontend { + struct inode *const inode; + + struct z_erofs_collector clt; + struct erofs_map_blocks map; + + /* used for applying cache strategy on the fly */ + bool backmost; + erofs_off_t headoffset; +}; + +#define COLLECTOR_INIT() { \ + .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = COLLECT_PRIMARY_FOLLOWED } + +#define DECOMPRESS_FRONTEND_INIT(__i) { \ + .inode = __i, .clt = COLLECTOR_INIT(), \ + .backmost = true, } + +static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; +static DEFINE_MUTEX(z_pagemap_global_lock); + +static void preload_compressed_pages(struct z_erofs_collector *clt, + struct address_space *mc, + enum z_erofs_cache_alloctype type, + struct list_head *pagepool) +{ + const struct z_erofs_pcluster *pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct page **pages = clt->compressedpages; + pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + bool standalone = true; + + if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + return; + + for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + struct page *page; + compressed_page_t t; + + /* the compressed page was loaded before */ + if (READ_ONCE(*pages)) + continue; + + page = find_get_page(mc, index); + + if (page) { + t = tag_compressed_page_justfound(page); + } else if (type == DELAYEDALLOC) { + t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else { /* DONTALLOC */ + if (standalone) + clt->compressedpages = pages; + standalone = false; + continue; + } + + if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + continue; + + if (page) + put_page(page); + } + + if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, 
struct z_erofs_pcluster, obj);
+	struct address_space *const mapping = MNGD_MAPPING(sbi);
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+	int i;
+
+	/*
+	 * the refcount of the workgroup is now frozen as 1,
+	 * therefore no need to worry about available decompression users.
+	 */
+	for (i = 0; i < clusterpages; ++i) {
+		struct page *page = pcl->compressed_pages[i];
+
+		if (!page)
+			continue;
+
+		/* block other users from reclaiming or migrating the page */
+		if (!trylock_page(page))
+			return -EBUSY;
+
+		if (page->mapping != mapping)
+			continue;
+
+		/* barrier is implied in the following 'unlock_page' */
+		WRITE_ONCE(pcl->compressed_pages[i], NULL);
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+	return 0;
+}
+
+int erofs_try_to_free_cached_page(struct address_space *mapping,
+				  struct page *page)
+{
+	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+	int ret = 0;	/* 0 - busy */
+
+	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
+		unsigned int i;
+
+		for (i = 0; i < clusterpages; ++i) {
+			if (pcl->compressed_pages[i] == page) {
+				WRITE_ONCE(pcl->compressed_pages[i], NULL);
+				ret = 1;
+				break;
+			}
+		}
+		erofs_workgroup_unfreeze(&pcl->obj, 1);
+
+		if (ret) {
+			ClearPagePrivate(page);
+			put_page(page);
+		}
+	}
+	return ret;
+}
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+					  struct page *page)
+{
+	struct z_erofs_pcluster *const pcl = clt->pcl;
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+
+	while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
+		if (!cmpxchg(clt->compressedpages++, NULL, page))
+			return true;
+	}
+	return false;
+}
+
+/* callers must hold the collection lock */
+static int z_erofs_attach_page(struct z_erofs_collector *clt,
+			       struct page *page,
+			       enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority to in-place I/O */
+	if (clt->mode >= COLLECT_PRIMARY &&
+	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+	    z_erofs_try_inplace_io(clt, page))
+		return 0;
+
+	ret = z_erofs_pagevec_enqueue(&clt->vector,
+				      page, type, &occupied);
+	clt->cl->vcnt += (unsigned int)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static enum z_erofs_collectmode
+try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
+		      z_erofs_next_pcluster_t *owned_head)
+{
+	/* let's claim the following types of pclusters */
+retry:
+	if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
+		/* type 1, nil pcluster */
+		if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+			    *owned_head) != Z_EROFS_PCLUSTER_NIL)
+			goto retry;
+
+		*owned_head = &pcl->next;
+		/* lucky, I am the followee :) */
+		return COLLECT_PRIMARY_FOLLOWED;
+	} else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
+		/*
+		 * type 2, link to the end of an existing open chain;
+		 * be careful that its submission itself is governed
+		 * by the original owned chain.
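+		 *
+		 * (Editorial illustration: in the type-1 branch above,
+		 * pcl->next is NIL, so the caller attaches its own chain
+		 * and becomes the followee; in this type-2 branch the
+		 * pcluster already tails an open chain, so only the
+		 * hook-up is recorded and *owned_head is reset to TAIL.)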
+ */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) != Z_EROFS_PCLUSTER_TAIL) + goto retry; + *owned_head = Z_EROFS_PCLUSTER_TAIL; + return COLLECT_PRIMARY_HOOKED; + } + return COLLECT_PRIMARY; /* :( better luck next time */ +} + +static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_workgroup *grp; + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + unsigned int length; + bool tag; + + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); + if (!grp) + return NULL; + + pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + + cl = z_erofs_primarycollection(pcl); + if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + + length = READ_ONCE(pcl->length); + if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { + if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + } else { + unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; + + if (map->m_flags & EROFS_MAP_FULL_MAPPED) + llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; + + while (llen > length && + length != cmpxchg_relaxed(&pcl->length, length, llen)) { + cpu_relax(); + length = READ_ONCE(pcl->length); + } + } + mutex_lock(&cl->lock); + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); + /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = NULL; + clt->pcl = pcl; + clt->cl = cl; + return cl; +} + +static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + int err; + + /* no available workgroup, let's allocate one */ + pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + + z_erofs_pcluster_init_always(pcl); + pcl->obj.index = map->m_pa >> PAGE_SHIFT; + + pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | + (map->m_flags & EROFS_MAP_FULL_MAPPED ? + Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + + if (map->m_flags & EROFS_MAP_ZIPPED) + pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; + else + pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + + pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; + pcl->clusterbits -= PAGE_SHIFT; + + /* new pclusters should be claimed as type 1, primary and followed */ + pcl->next = clt->owned_head; + clt->mode = COLLECT_PRIMARY_FOLLOWED; + + cl = z_erofs_primarycollection(pcl); + cl->pageofs = map->m_la & ~PAGE_MASK; + + /* + * lock all primary followed works before visible to others + * and mutex_trylock *never* fails for a new pcluster. 
+ */ + mutex_trylock(&cl->lock); + + err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0); + if (err) { + mutex_unlock(&cl->lock); + kmem_cache_free(pcluster_cachep, pcl); + return ERR_PTR(-EAGAIN); + } + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->owned_head = &pcl->next; + clt->pcl = pcl; + clt->cl = cl; + return cl; +} + +static int z_erofs_collector_begin(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_collection *cl; + + DBG_BUGON(clt->cl); + + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + + if (!PAGE_ALIGNED(map->m_pa)) { + DBG_BUGON(1); + return -EINVAL; + } + +repeat: + cl = cllookup(clt, inode, map); + if (!cl) { + cl = clregister(clt, inode, map); + + if (cl == ERR_PTR(-EAGAIN)) + goto repeat; + } + + if (IS_ERR(cl)) + return PTR_ERR(cl); + + z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, cl->vcnt); + + clt->compressedpages = clt->pcl->compressed_pages; + if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ + clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + return 0; +} + +/* + * keep in mind that no referenced pclusters will be freed + * only after a RCU grace period. + */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + struct z_erofs_collection *const cl = + container_of(head, struct z_erofs_collection, rcu); + + kmem_cache_free(pcluster_cachep, + container_of(cl, struct z_erofs_pcluster, + primary_collection)); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); + + call_rcu(&cl->rcu, z_erofs_rcu_callback); +} + +static void z_erofs_collection_put(struct z_erofs_collection *cl) +{ + struct z_erofs_pcluster *const pcl = + container_of(cl, struct z_erofs_pcluster, primary_collection); + + erofs_workgroup_put(&pcl->obj); +} + +static bool z_erofs_collector_end(struct z_erofs_collector *clt) +{ + struct z_erofs_collection *cl = clt->cl; + + if (!cl) + return false; + + z_erofs_pagevec_ctor_exit(&clt->vector, false); + mutex_unlock(&cl->lock); + + /* + * if all pending pages are added, don't hold its reference + * any longer if the pcluster isn't hosted by ourselves. 
+ */ + if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + z_erofs_collection_put(cl); + + clt->cl = NULL; + return true; +} + +static inline struct page *__stagingpage_alloc(struct list_head *pagepool, + gfp_t gfp) +{ + struct page *page = erofs_allocpage(pagepool, gfp, true); + + page->mapping = Z_EROFS_MAPPING_STAGING; + return page; +} + +static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, + unsigned int cachestrategy, + erofs_off_t la) +{ + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + la < fe->headoffset; +} + +static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, + struct page *page, + struct list_head *pagepool) +{ + struct inode *const inode = fe->inode; + struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode); + struct erofs_map_blocks *const map = &fe->map; + struct z_erofs_collector *const clt = &fe->clt; + const loff_t offset = page_offset(page); + bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + + enum z_erofs_cache_alloctype cache_strategy; + enum z_erofs_page_type page_type; + unsigned int cur, end, spiltted, index; + int err = 0; + + /* register locked file pages as online pages in pack */ + z_erofs_onlinepage_init(page); + + spiltted = 0; + end = PAGE_SIZE; +repeat: + cur = end - 1; + + /* lucky, within the range of the current map_blocks */ + if (offset + cur >= map->m_la && + offset + cur < map->m_la + map->m_llen) { + /* didn't get a valid collection previously (very rare) */ + if (!clt->cl) + goto restart_now; + goto hitted; + } + + /* go ahead the next map_blocks */ + erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); + + if (z_erofs_collector_end(clt)) + fe->backmost = false; + + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + +restart_now: + if (!(map->m_flags & EROFS_MAP_MAPPED)) + goto hitted; + + err = z_erofs_collector_begin(clt, inode, map); + if (err) + goto err_out; + + /* preload all compressed pages (maybe downgrade role if necessary) */ + if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) + cache_strategy = DELAYEDALLOC; + else + cache_strategy = DONTALLOC; + + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); + + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); +hitted: + cur = end - min_t(unsigned int, offset + end - map->m_la, end); + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, end); + goto next_part; + } + + /* let's derive page type */ + page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : + (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + (tight ? 
Z_EROFS_PAGE_TYPE_EXCLUSIVE : + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + + if (cur) + tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); + +retry: + err = z_erofs_attach_page(clt, page, page_type); + /* should allocate an additional staging page for pagevec */ + if (err == -EAGAIN) { + struct page *const newpage = + __stagingpage_alloc(pagepool, GFP_NOFS); + + err = z_erofs_attach_page(clt, newpage, + Z_EROFS_PAGE_TYPE_EXCLUSIVE); + if (!err) + goto retry; + } + + if (err) + goto err_out; + + index = page->index - (map->m_la >> PAGE_SHIFT); + + z_erofs_onlinepage_fixup(page, index, true); + + /* bump up the number of spiltted parts of a page */ + ++spiltted; + /* also update nr_pages */ + clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); +next_part: + /* can be used for verification */ + map->m_llen = offset + cur - map->m_la; + + end = cur; + if (end > 0) + goto repeat; + +out: + z_erofs_onlinepage_endio(page); + + erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", + __func__, page, spiltted, map->m_llen); + return err; + + /* if some error occurred while processing this page */ +err_out: + SetPageError(page); + goto out; +} + +static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +{ + tagptr1_t t = tagptr_init(tagptr1_t, ptr); + struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t); + bool background = tagptr_unfold_tags(t); + + if (!background) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (!atomic_add_return(bios, &io->pending_bios)) + queue_work(z_erofs_workqueue, &io->u.work); +} + +static inline void z_erofs_vle_read_endio(struct bio *bio) +{ + struct erofs_sb_info *sbi = NULL; + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + bool cachemngd = false; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(!page->mapping); + + if (!sbi && !z_erofs_page_is_staging(page)) + sbi = EROFS_SB(page->mapping->host->i_sb); + + /* sbi should already be gotten if the page is managed */ + if (sbi) + cachemngd = erofs_page_is_managed(sbi, page); + + if (err) + SetPageError(page); + else if (cachemngd) + SetPageUptodate(page); + + if (cachemngd) + unlock_page(page); + } + + z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + bio_put(bio); +} + +static int z_erofs_decompress_pcluster(struct super_block *sb, + struct z_erofs_pcluster *pcl, + struct list_head *pagepool) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct z_erofs_pagevec_ctor ctor; + unsigned int i, outputsize, llen, nr_pages; + struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; + struct page **pages, **compressed_pages, *page; + + enum z_erofs_page_type page_type; + bool overlapped, partial; + struct z_erofs_collection *cl; + int err; + + might_sleep(); + cl = z_erofs_primarycollection(pcl); + DBG_BUGON(!READ_ONCE(cl->nr_pages)); + + mutex_lock(&cl->lock); + nr_pages = cl->nr_pages; + + if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { + pages = pages_onstack; + } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && + mutex_trylock(&z_pagemap_global_lock)) { + pages = z_pagemap_global; + } else { + gfp_t gfp_flags = GFP_KERNEL; + + if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) + gfp_flags |= __GFP_NOFAIL; + + pages = kvmalloc_array(nr_pages, 
sizeof(struct page *), + gfp_flags); + + /* fallback to global pagemap for the lowmem scenario */ + if (!pages) { + mutex_lock(&z_pagemap_global_lock); + pages = z_pagemap_global; + } + } + + for (i = 0; i < nr_pages; ++i) + pages[i] = NULL; + + err = 0; + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, 0); + + for (i = 0; i < cl->vcnt; ++i) { + unsigned int pagenr; + + page = z_erofs_pagevec_dequeue(&ctor, &page_type); + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + + /* + * currently EROFS doesn't support multiref(dedup), + * so here erroring out one multiref page. + */ + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + } + z_erofs_pagevec_ctor_exit(&ctor, true); + + overlapped = false; + compressed_pages = pcl->compressed_pages; + + for (i = 0; i < clusterpages; ++i) { + unsigned int pagenr; + + page = compressed_pages[i]; + + /* all compressed pages ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (!z_erofs_page_is_staging(page)) { + if (erofs_page_is_managed(sbi, page)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + /* + * only if non-head page can be selected + * for inplace decompression + */ + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + + overlapped = true; + } + + /* PG_error needs checking for inplaced and staging pages */ + if (PageError(page)) { + DBG_BUGON(PageUptodate(page)); + err = -EIO; + } + } + + if (err) + goto out; + + llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; + if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + outputsize = llen; + partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); + } else { + outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + partial = true; + } + + err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + .sb = sb, + .in = compressed_pages, + .out = pages, + .pageofs_out = cl->pageofs, + .inputsize = PAGE_SIZE, + .outputsize = outputsize, + .alg = pcl->algorithmformat, + .inplace_io = overlapped, + .partial_decoding = partial + }, pagepool); + +out: + /* must handle all compressed pages before endding pages */ + for (i = 0; i < clusterpages; ++i) { + page = compressed_pages[i]; + + if (erofs_page_is_managed(sbi, page)) + continue; + + /* recycle all individual staging pages */ + (void)z_erofs_put_stagingpage(pagepool, page); + + WRITE_ONCE(compressed_pages[i], NULL); + } + + for (i = 0; i < nr_pages; ++i) { + page = pages[i]; + if (!page) + continue; + + DBG_BUGON(!page->mapping); + + /* recycle all individual staging pages */ + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (err < 0) + SetPageError(page); + + z_erofs_onlinepage_endio(page); + } + + if (pages == z_pagemap_global) + mutex_unlock(&z_pagemap_global_lock); + else if (pages != pages_onstack) + kvfree(pages); + + cl->nr_pages = 0; + cl->vcnt = 0; + + /* all cl locks MUST be taken before the following line */ + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + + /* all cl locks SHOULD be released right now */ + 
mutex_unlock(&cl->lock);
+
+	z_erofs_collection_put(cl);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_unzip_io *io,
+				  struct list_head *pagepool)
+{
+	z_erofs_next_pcluster_t owned = io->head;
+
+	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
+		struct z_erofs_pcluster *pcl;
+
+		/* 'owned' should never equal Z_EROFS_PCLUSTER_TAIL here */
+		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
+
+		/* 'owned' should never equal Z_EROFS_PCLUSTER_NIL either */
+		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
+
+		pcl = container_of(owned, struct z_erofs_pcluster, next);
+		owned = READ_ONCE(pcl->next);
+
+		z_erofs_decompress_pcluster(sb, pcl, pagepool);
+	}
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_unzip_io_sb *iosb =
+		container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
+	LIST_HEAD(pagepool);
+
+	DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
+
+	put_pages_list(&pagepool);
+	kvfree(iosb);
+}
+
+static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
+					       unsigned int nr,
+					       struct list_head *pagepool,
+					       struct address_space *mc,
+					       gfp_t gfp)
+{
+	/* determined at compile time to avoid too many #ifdefs */
+	const bool nocache = __builtin_constant_p(mc) ? !mc : false;
+	const pgoff_t index = pcl->obj.index;
+	bool tocache = false;
+
+	struct address_space *mapping;
+	struct page *oldpage, *page;
+
+	compressed_page_t t;
+	int justfound;
+
+repeat:
+	page = READ_ONCE(pcl->compressed_pages[nr]);
+	oldpage = page;
+
+	if (!page)
+		goto out_allocpage;
+
+	/*
+	 * the cached page has not been allocated yet and a placeholder
+	 * is out there; prepare it now.
+	 */
+	if (!nocache && page == PAGE_UNALLOCATED) {
+		tocache = true;
+		goto out_allocpage;
+	}
+
+	/* process the target tagged pointer */
+	t = tagptr_init(compressed_page_t, page);
+	justfound = tagptr_unfold_tags(t);
+	page = tagptr_unfold_ptr(t);
+
+	mapping = READ_ONCE(page->mapping);
+
+	/*
+	 * if managed cache is disabled, there is no way to
+	 * get such a cached-like page.
+	 */
+	if (nocache) {
+		/* if managed cache is disabled, 'justfound' is impossible */
+		DBG_BUGON(justfound);
+
+		/* and it should be locked, not uptodate, and not truncated */
+		DBG_BUGON(!PageLocked(page));
+		DBG_BUGON(PageUptodate(page));
+		DBG_BUGON(!mapping);
+		goto out;
+	}
+
+	/*
+	 * unmanaged (file) pages are all locked solidly,
+	 * therefore it is impossible for 'mapping' to be NULL.
+	 */
+	if (mapping && mapping != mc)
+		/* ought to be unmanaged pages */
+		goto out;
+
+	lock_page(page);
+
+	/* only true if page reclaim goes wrong, should never happen */
+	DBG_BUGON(justfound && PagePrivate(page));
+
+	/* the page is still in managed cache */
+	if (page->mapping == mc) {
+		WRITE_ONCE(pcl->compressed_pages[nr], page);
+
+		ClearPageError(page);
+		if (!PagePrivate(page)) {
+			/*
+			 * impossible to be !PagePrivate(page) under the
+			 * current restriction if the page is already in
+			 * compressed_pages[].
+			 */
+			DBG_BUGON(!justfound);
+
+			justfound = 0;
+			set_page_private(page, (unsigned long)pcl);
+			SetPagePrivate(page);
+		}
+
+		/* no need to submit io if it is already up-to-date */
+		if (PageUptodate(page)) {
+			unlock_page(page);
+			page = NULL;
+		}
+		goto out;
+	}
+
+	/*
+	 * the managed page has been truncated, it's unsafe to
+	 * reuse this one, let's allocate a new cache-managed page.
+ */ + DBG_BUGON(page->mapping); + DBG_BUGON(!justfound); + + tocache = true; + unlock_page(page); + put_page(page); +out_allocpage: + page = __stagingpage_alloc(pagepool, gfp); + if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + list_add(&page->lru, pagepool); + cpu_relax(); + goto repeat; + } + if (nocache || !tocache) + goto out; + if (add_to_page_cache_lru(page, mc, index + nr, gfp)) { + page->mapping = Z_EROFS_MAPPING_STAGING; + goto out; + } + + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); +out: /* the only exit (for tracing and debugging) */ + return page; +} + +static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, + struct z_erofs_unzip_io *io, + bool foreground) +{ + struct z_erofs_unzip_io_sb *iosb; + + if (foreground) { + /* waitqueue available for foreground io */ + DBG_BUGON(!io); + + init_waitqueue_head(&io->u.wait); + atomic_set(&io->pending_bios, 0); + goto out; + } + + iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL); + DBG_BUGON(!iosb); + + /* initialize fields in the allocated descriptor */ + io = &iosb->io; + iosb->sb = sb; + INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); +out: + io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + return io; +} + +/* define decompression jobqueue types */ +enum { + JQ_BYPASS, + JQ_SUBMIT, + NR_JOBQUEUES, +}; + +static void *jobqueueset_init(struct super_block *sb, + z_erofs_next_pcluster_t qtail[], + struct z_erofs_unzip_io *q[], + struct z_erofs_unzip_io *fgq, + bool forcefg) +{ + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + + return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); +} + +static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t qtail[], + z_erofs_next_pcluster_t owned_head) +{ + z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; + z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; + + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (owned_head == Z_EROFS_PCLUSTER_TAIL) + owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + + WRITE_ONCE(*submit_qtail, owned_head); + WRITE_ONCE(*bypass_qtail, &pcl->next); + + qtail[JQ_BYPASS] = &pcl->next; +} + +static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], + unsigned int nr_bios, + bool force_fg) +{ + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. 
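+ * (Editorial note: this is the background all-cached case, where every
+ * pcluster was moved to the bypass queue and no bio was ever built, so
+ * the submit jobqueue can simply be freed here.)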
+	 */
+	if (force_fg || nr_bios)
+		return false;
+
+	kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
+	return true;
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_next_pcluster_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_unzip_io *fgq,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+	struct z_erofs_unzip_io *q[NR_JOBQUEUES];
+	struct bio *bio;
+	void *bi_private;
+	/* since bio will be NULL, no need to initialize last_index */
+	pgoff_t uninitialized_var(last_index);
+	bool force_submit = false;
+	unsigned int nr_bios;
+
+	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
+		return false;
+
+	force_submit = false;
+	bio = NULL;
+	nr_bios = 0;
+	bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
+
+	/* by default, all need io submission */
+	q[JQ_SUBMIT]->head = owned_head;
+
+	do {
+		struct z_erofs_pcluster *pcl;
+		unsigned int clusterpages;
+		pgoff_t first_index;
+		struct page *page;
+		unsigned int i = 0, bypass = 0;
+		int err;
+
+		/* 'owned_head' can never equal either of the following */
+		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
+
+		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+
+		clusterpages = BIT(pcl->clusterbits);
+
+		/* close the main owned chain first */
+		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+		first_index = pcl->obj.index;
+		force_submit |= (first_index != last_index + 1);
+
+repeat:
+		page = pickup_page_for_submission(pcl, i, pagepool,
+						  MNGD_MAPPING(sbi),
+						  GFP_NOFS);
+		if (!page) {
+			force_submit = true;
+			++bypass;
+			goto skippage;
+		}
+
+		if (bio && force_submit) {
+submit_bio_retry:
+			submit_bio(bio);
+			bio = NULL;
+		}
+
+		if (!bio) {
+			bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+
+			bio->bi_end_io = z_erofs_vle_read_endio;
+			bio_set_dev(bio, sb->s_bdev);
+			bio->bi_iter.bi_sector = (sector_t)(first_index + i) <<
+				LOG_SECTORS_PER_BLOCK;
+			bio->bi_private = bi_private;
+			bio->bi_opf = REQ_OP_READ;
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+skippage:
+		if (++i < clusterpages)
+			goto repeat;
+
+		if (bypass < clusterpages)
+			qtail[JQ_SUBMIT] = &pcl->next;
+		else
+			move_to_bypass_jobqueue(pcl, qtail, owned_head);
+	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+
+	if (bio)
+		submit_bio(bio);
+
+	if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
+		return true;
+
+	z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct super_block *sb,
+				     struct z_erofs_collector *clt,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct z_erofs_unzip_io io[NR_JOBQUEUES];
+
+	if (!z_erofs_vle_submit_all(sb, clt->owned_head,
+				    pagepool, io, force_fg))
+		return;
+
+	/* decompress the bypassed (no-I/O) pclusters immediately */
+	z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[JQ_SUBMIT].u.wait,
+		   !atomic_read(&io[JQ_SUBMIT].pending_bios));
+
+	/* let's do synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+					     struct page *page)
+{
+	struct inode *const inode = page->mapping->host;
+	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+	int err;
+	LIST_HEAD(pagepool);
+
+	trace_erofs_readpage(page, false);
+
+	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_collector_end(&f.clt);
+
+	/* if any compressed clusters are ready, submit them anyway */
+	z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
+
+	if (err)
+		erofs_err(inode->i_sb, "failed to read, err [%d]", err);
+
+	if (f.map.mpage)
+		put_page(f.map.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return err;
+}
+
+static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
+					    unsigned int nr)
+{
+	return nr <= sbi->max_sync_decompress_pages;
+}
+
+static int z_erofs_vle_normalaccess_readpages(struct file *filp,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned int nr_pages)
+{
+	struct inode *const inode = mapping->host;
+	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+
+	bool sync = should_decompress_synchronously(sbi, nr_pages);
+	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	trace_erofs_readpages(mapping->host, lru_to_page(pages),
+			      nr_pages, false);
+
+	f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		/*
+		 * A pure asynchronous readahead is indicated if
+		 * a PG_readahead marked page is hit first.
+		 * Let's also do asynchronous decompression for this case.
+		 */
+		sync &= !(PageReadahead(page) && !head);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err)
+			erofs_err(inode->i_sb,
+				  "readahead error at page %lu @ nid %llu",
+				  page->index, EROFS_I(inode)->nid);
+		put_page(page);
+	}
+
+	(void)z_erofs_collector_end(&f.clt);
+
+	z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
+
+	if (f.map.mpage)
+		put_page(f.map.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+const struct address_space_operations z_erofs_vle_normalaccess_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
+
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
new file mode 100644
index 000000000000..faf950189bd7
--- /dev/null
+++ b/fs/erofs/zdata.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_ZDATA_H
+#define __EROFS_FS_ZDATA_H
+
+#include "internal.h"
+#include "zpvec.h"
+
+#define Z_EROFS_NR_INLINE_PAGEVECS	3
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else;
+ *
+ * L: Field should be protected by pageset lock;
+ *
+ * A: Field should be accessed / updated in atomic for parallelized code.
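+ *
+ * For example, z_erofs_pcluster.next (an A field) is only advanced with
+ * cmpxchg()/WRITE_ONCE(), while z_erofs_collection.nr_pages (an L field)
+ * is only touched with cl->lock held.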
+ */
+struct z_erofs_collection {
+	struct mutex lock;
+
+	/* I: page offset of start position of decompression */
+	unsigned short pageofs;
+
+	/* L: maximum relative page index in pagevec[] */
+	unsigned short nr_pages;
+
+	/* L: total number of pages in pagevec[] */
+	unsigned int vcnt;
+
+	union {
+		/* L: inline a certain number of pagevecs for bootstrap */
+		erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
+
+		/* I: can be used to free the pcluster by RCU. */
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_PCLUSTER_FULL_LENGTH	0x00000001
+#define Z_EROFS_PCLUSTER_LENGTH_BIT	1
+
+/*
+ * let's leave a type here in case another
+ * tagged pointer is introduced later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
+struct z_erofs_pcluster {
+	struct erofs_workgroup obj;
+	struct z_erofs_collection primary_collection;
+
+	/* A: point to next chained pcluster or TAILs */
+	z_erofs_next_pcluster_t next;
+
+	/* A: compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+
+	/* A: lower limit of decompressed length and if full length or not */
+	unsigned int length;
+
+	/* I: compression algorithm format */
+	unsigned char algorithmformat;
+	/* I: bit shift of physical cluster size */
+	unsigned char clusterbits;
+};
+
+#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_PCLUSTER_TAIL		((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_PCLUSTER_TAIL_CLOSED	((void *)0x5F0EDEAD)
+
+#define Z_EROFS_PCLUSTER_NIL		(NULL)
+
+#define Z_EROFS_WORKGROUP_SIZE		sizeof(struct z_erofs_pcluster)
+
+struct z_erofs_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_next_pcluster_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_unzip_io_sb {
+	struct z_erofs_unzip_io io;
+	struct super_block *sb;
+};
+
+#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+					 struct page *page)
+{
+	return page->mapping == MNGD_MAPPING(sbi);
+}
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS	2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK	((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT	(Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): the pending count that must drop to zero
+ *   before the page can be unlocked (the low COUNT_BITS bits);
+ * sub-index: 0 for a partial page, >= 1 for each full page sub-index
+ *   (the high bits above INDEX_SHIFT).
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned int z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	DBG_BUGON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+		/* keep the page from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+					    uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		DBG_BUGON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned int v;
+
+	DBG_BUGON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VMAP_ONSTACK_PAGES	\
+	min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
+#define Z_EROFS_VMAP_GLOBAL_PAGES	2048
+
+#endif
+
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
new file mode 100644
index 000000000000..6a26c293ae2d
--- /dev/null
+++ b/fs/erofs/zmap.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018-2019 HUAWEI, Inc.
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <asm/unaligned.h> +#include <trace/events/erofs.h> + +int z_erofs_fill_inode(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + vi->z_advise = 0; + vi->z_algorithmtype[0] = 0; + vi->z_algorithmtype[1] = 0; + vi->z_logical_clusterbits = LOG_BLOCK_SIZE; + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); + } + + inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops; + return 0; +} + +static int fill_inode_lazy(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + int err; + erofs_off_t pos; + struct page *page; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + return 0; + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, 8); + page = erofs_get_meta_page(sb, erofs_blknr(pos)); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + kaddr = kmap_atomic(page); + + h = kaddr + erofs_blkoff(pos); + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", + vi->z_algorithmtype[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 3) & 3); + + if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { + erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel", + vi->z_physical_clusterbits[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 5) & 7); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +unmap_done: + kunmap_atomic(kaddr); + unlock_page(page); + put_page(page); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + +struct z_erofs_maprecorder { + struct inode *inode; + struct erofs_map_blocks *map; + void *kaddr; + + unsigned long lcn; + /* compression extent information gathered */ + u8 type; + u16 clusterofs; + u16 delta[2]; + erofs_blk_t pblk; +}; + +static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, + erofs_blk_t eblk) +{ + struct super_block *const sb = m->inode->i_sb; + struct erofs_map_blocks *const map = m->map; + struct page *mpage = map->mpage; + + if (mpage) { + if (mpage->index == eblk) { + if (!m->kaddr) + m->kaddr = kmap_atomic(mpage); + return 0; + } + + if (m->kaddr) { + kunmap_atomic(m->kaddr); + m->kaddr = NULL; + } + put_page(mpage); + } + + mpage = erofs_get_meta_page(sb, eblk); + if (IS_ERR(mpage)) { + map->mpage = NULL; + return PTR_ERR(mpage); + } + m->kaddr = kmap_atomic(mpage); + unlock_page(mpage); + map->mpage = mpage; + return 0; +} + +static int vle_legacy_load_cluster_from_disk(struct 
z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); + const erofs_off_t pos = + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_vle_decompressed_index); + struct z_erofs_vle_decompressed_index *di; + unsigned int advise, type; + int err; + + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + + m->lcn = lcn; + di = m->kaddr + erofs_blkoff(pos); + + advise = le16_to_cpu(di->di_advise); + type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); + switch (type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + m->clusterofs = 1 << vi->z_logical_clusterbits; + m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + m->delta[1] = le16_to_cpu(di->di_u.delta[1]); + break; + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + m->clusterofs = le16_to_cpu(di->di_clusterofs); + m->pblk = le32_to_cpu(di->di_u.blkaddr); + break; + default: + DBG_BUGON(1); + return -EOPNOTSUPP; + } + m->type = type; + return 0; +} + +static unsigned int decode_compactedbits(unsigned int lobits, + unsigned int lomask, + u8 *in, unsigned int pos, u8 *type) +{ + const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); + const unsigned int lo = v & lomask; + + *type = (v >> lobits) & 3; + return lo; +} + +static int unpack_compacted_index(struct z_erofs_maprecorder *m, + unsigned int amortizedshift, + unsigned int eofs) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int vcnt, base, lo, encodebits, nblk; + int i; + u8 *in, type; + + if (1 << amortizedshift == 4) + vcnt = 2; + else if (1 << amortizedshift == 2 && lclusterbits == 12) + vcnt = 16; + else + return -EOPNOTSUPP; + + encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + base = round_down(eofs, vcnt << amortizedshift); + in = m->kaddr + base; + + i = (eofs - base) >> amortizedshift; + + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + m->type = type; + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + m->clusterofs = 1 << lclusterbits; + if (i + 1 != vcnt) { + m->delta[0] = lo; + return 0; + } + /* + * since the last lcluster in the pack is special, + * of which lo saves delta[1] rather than delta[0]. + * Hence, get delta[0] by the previous lcluster indirectly. 
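+	 *
+	 * A hypothetical example with vcnt == 2: only i == 1 is special;
+	 * if lcluster 0 is also NONHEAD, its lo already holds delta[0],
+	 * so delta[0] of lcluster 1 becomes that value plus one; if
+	 * lcluster 0 is a HEAD/PLAIN lcluster instead, delta[0] is 1.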
+ */ + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * (i - 1), &type); + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + lo = 0; + m->delta[0] = lo + 1; + return 0; + } + m->clusterofs = lo; + m->delta[0] = 0; + /* figout out blkaddr (pblk) for HEAD lclusters */ + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= lo; + + if (i >= 0) + ++nblk; + } + in += (vcnt << amortizedshift) - sizeof(__le32); + m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; + return 0; +} + +static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + + vi->inode_isize + vi->xattr_isize, 8) + + sizeof(struct z_erofs_map_header); + const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + unsigned int compacted_4b_initial, compacted_2b; + unsigned int amortizedshift; + erofs_off_t pos; + int err; + + if (lclusterbits != 12) + return -EOPNOTSUPP; + + if (lcn >= totalidx) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = (32 - ebase % 32) / 4; + if (compacted_4b_initial == 32 / 4) + compacted_4b_initial = 0; + + if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + else + compacted_2b = 0; + + pos = ebase; + if (lcn < compacted_4b_initial) { + amortizedshift = 2; + goto out; + } + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + + if (lcn < compacted_2b) { + amortizedshift = 1; + goto out; + } + pos += compacted_2b * 2; + lcn -= compacted_2b; + amortizedshift = 2; +out: + pos += lcn * (1 << amortizedshift); + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); +} + +static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn) +{ + const unsigned int datamode = EROFS_I(m->inode)->datalayout; + + if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + return vle_legacy_load_cluster_from_disk(m, lcn); + + if (datamode == EROFS_INODE_FLAT_COMPRESSION) + return compacted_load_cluster_from_disk(m, lcn); + + return -EINVAL; +} + +static int vle_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn = m->lcn; + int err; + + if (lcn < lookback_distance) { + erofs_err(m->inode->i_sb, + "bogus lookback distance @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + /* load extent head logical cluster if needed */ + lcn -= lookback_distance; + err = vle_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (!m->delta[0]) { + erofs_err(m->inode->i_sb, + "invalid lookback distance 0 @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return vle_extent_lookback(m, m->delta[0]); + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + map->m_la = (lcn << lclusterbits) | 
m->clusterofs; + break; + default: + erofs_err(m->inode->i_sb, + "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + return 0; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct z_erofs_maprecorder m = { + .inode = inode, + .map = map, + }; + int err = 0; + unsigned int lclusterbits, endoff; + unsigned long long ofs, end; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (map->m_la >= inode->i_size) { + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + err = fill_inode_lazy(inode); + if (err) + goto out; + + lclusterbits = vi->z_logical_clusterbits; + ofs = map->m_la; + m.lcn = ofs >> lclusterbits; + endoff = ofs & ((1 << lclusterbits) - 1); + + err = vle_load_cluster_from_disk(&m, m.lcn); + if (err) + goto unmap_out; + + map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + end = (m.lcn + 1ULL) << lclusterbits; + + switch (m.type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + if (endoff >= m.clusterofs) + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + if (endoff >= m.clusterofs) { + map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + break; + } + /* m.lcn should be >= 1 if endoff < m.clusterofs */ + if (!m.lcn) { + erofs_err(inode->i_sb, + "invalid logical cluster 0 at nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } + end = (m.lcn << lclusterbits) | m.clusterofs; + map->m_flags |= EROFS_MAP_FULL_MAPPED; + m.delta[0] = 1; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + /* get the correspoinding first chunk */ + err = vle_extent_lookback(&m, m.delta[0]); + if (err) + goto unmap_out; + break; + default: + erofs_err(inode->i_sb, + "unknown type %u @ offset %llu of nid %llu", + m.type, ofs, vi->nid); + err = -EOPNOTSUPP; + goto unmap_out; + } + + map->m_llen = end - map->m_la; + map->m_plen = 1 << lclusterbits; + map->m_pa = blknr_to_addr(m.pblk); + map->m_flags |= EROFS_MAP_MAPPED; + +unmap_out: + if (m.kaddr) + kunmap_atomic(m.kaddr); + +out: + erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", + __func__, map->m_la, map->m_pa, + map->m_llen, map->m_plen, map->m_flags); + + trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + + /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ + DBG_BUGON(err < 0 && err != -ENOMEM); + return err; +} + diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h new file mode 100644 index 000000000000..58556903aa94 --- /dev/null +++ b/fs/erofs/zpvec.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_ZPVEC_H +#define __EROFS_FS_ZPVEC_H + +#include "tagptr.h" + +/* page type in pagevec for decompress subsystem */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + +extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") + __bad_page_type_exclusive(void); + +/* pagevec tagged pointer */ +typedef tagptr2_t erofs_vtptr_t; + +/* pagevec collector */ +struct z_erofs_pagevec_ctor { + struct page *curr, *next; + erofs_vtptr_t *pages; + + unsigned int nr, index; +}; + +static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + if (!ctor->curr) + return; + + if (atomic) + kunmap_atomic(ctor->pages); + else + kunmap(ctor->curr); +} + +static inline struct page * +z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, + unsigned int nr) +{ + unsigned int index; + + /* keep away from occupied pages */ + if (ctor->next) + return ctor->next; + + for (index = 0; index < nr; ++index) { + const erofs_vtptr_t t = ctor->pages[index]; + const unsigned int tags = tagptr_unfold_tags(t); + + if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) + return tagptr_unfold_ptr(t); + } + DBG_BUGON(nr >= ctor->nr); + return NULL; +} + +static inline void +z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); + + z_erofs_pagevec_ctor_exit(ctor, atomic); + + ctor->curr = next; + ctor->next = NULL; + ctor->pages = atomic ? + kmap_atomic(ctor->curr) : kmap(ctor->curr); + + ctor->nr = PAGE_SIZE / sizeof(struct page *); + ctor->index = 0; +} + +static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, + unsigned int nr, + erofs_vtptr_t *pages, + unsigned int i) +{ + ctor->nr = nr; + ctor->curr = ctor->next = NULL; + ctor->pages = pages; + + if (i >= nr) { + i -= nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + while (i > ctor->nr) { + i -= ctor->nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + } + } + ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); + ctor->index = i; +} + +static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, + struct page *page, + enum z_erofs_page_type type, + bool *occupied) +{ + *occupied = false; + if (!ctor->next && type) + if (ctor->index + 1 == ctor->nr) + return false; + + if (ctor->index >= ctor->nr) + z_erofs_pagevec_ctor_pagedown(ctor, false); + + /* exclusive page type must be 0 */ + if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) + __bad_page_type_exclusive(); + + /* should remind that collector->next never equal to 1, 2 */ + if (type == (uintptr_t)ctor->next) { + ctor->next = page; + *occupied = true; + } + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); + return true; +} + +static inline struct page * +z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, + enum z_erofs_page_type *type) +{ + erofs_vtptr_t t; + + if (ctor->index >= ctor->nr) { + DBG_BUGON(!ctor->next); + z_erofs_pagevec_ctor_pagedown(ctor, true); + } + + t = ctor->pages[ctor->index]; + + *type = tagptr_unfold_tags(t); + + /* should remind that collector->next never equal to 1, 2 */ + if (*type == (uintptr_t)ctor->next) + ctor->next = tagptr_unfold_ptr(t); + + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); + return 
tagptr_unfold_ptr(t); +} +#endif + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d7f1f5011fac..c4159bcc05d9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1459,13 +1459,13 @@ static int ep_create_wakeup_source(struct epitem *epi) struct wakeup_source *ws; if (!epi->ep->ws) { - epi->ep->ws = wakeup_source_register("eventpoll"); + epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } name = epi->ffd.file->f_path.dentry->d_name.name; - ws = wakeup_source_register(name); + ws = wakeup_source_register(NULL, name); if (!ws) return -ENOMEM; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index f0e549783caf..09bc68708d28 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -7,7 +7,7 @@ * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here - * take a look at Documentation/filesystems/nfs/Exporting. + * take a look at Documentation/filesystems/nfs/exporting.rst. */ #include <linux/exportfs.h> #include <linux/fs.h> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8fdfcd3c3e04..b17ddc229ac5 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -13,3 +13,4 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..9c7f4036021b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -41,6 +41,7 @@ #endif #include <linux/fscrypt.h> +#include <linux/fsverity.h> #include <linux/compiler.h> @@ -395,6 +396,7 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ @@ -402,7 +404,7 @@ struct flex_groups { #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */ +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ @@ -467,6 +469,7 @@ enum { EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. 
*/ @@ -512,6 +515,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); @@ -1560,6 +1564,7 @@ enum { EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1610,6 +1615,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_SB(sb) (sb) #endif +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -1662,6 +1673,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1756,6 +1768,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) @@ -1813,7 +1826,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ - EXT4_FEATURE_RO_COMPAT_PROJECT) + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -3177,6 +3191,8 @@ static inline void ext4_set_de_type(struct super_block *sb, extern int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; @@ -3283,6 +3299,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +/* verity.c */ +extern const struct fsverity_operations ext4_verityops; + /* * Add new method to test whether block and inode bitmaps are properly * initialized. 
With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 70b0438dbc94..b8a20bb9a145 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -457,6 +457,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret) return ret; + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; + /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..d0dc0e3463db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1340,6 +1340,9 @@ retry_journal: } if (ret) { + bool extended = (pos + len > inode->i_size) && + !ext4_verity_in_progress(inode); + unlock_page(page); /* * __block_write_begin may have instantiated a few blocks @@ -1349,11 +1352,11 @@ retry_journal: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); - if (pos + len > inode->i_size) { + if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might @@ -1406,6 +1409,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (inline_data) { @@ -1423,12 +1427,16 @@ static int ext4_write_end(struct file *file, /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. + * + * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree + * blocks are being written past EOF, so skip the i_size update. */ - i_size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily @@ -1439,7 +1447,7 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. 
So truncate them @@ -1450,7 +1458,7 @@ errout: if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1511,6 +1519,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); @@ -1540,13 +1549,14 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed || inline_data) { @@ -1555,7 +1565,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1566,7 +1576,7 @@ errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -2162,7 +2172,8 @@ static int ext4_writepage(struct page *page, trace_ext4_writepage(page); size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2246,7 +2257,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) * after page tables are updated. 
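	 *
	 * While fs-verity is building its Merkle tree, blocks past i_size
	 * are written out as well, so the EOF partial-page length below is
	 * only used when no verity build is in progress.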
*/ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2345,6 +2357,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; + if (ext4_verity_in_progress(inode)) + blocks = EXT_MAX_BLOCKS; + do { BUG_ON(buffer_locked(bh)); @@ -3061,8 +3076,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || - S_ISLNK(inode->i_mode)) { + if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || + ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3897,6 +3912,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return 0; #endif + if (fsverity_active(inode)) + return 0; /* * If we are doing data journalling we don't support O_DIRECT @@ -4586,7 +4603,6 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; - struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4675,7 +4691,6 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ - blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4706,7 +4721,6 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); - blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, @@ -4739,6 +4753,8 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) + return false; return true; } @@ -4763,9 +4779,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; + if (flags & EXT4_VERITY_FL) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| - S_ENCRYPTED|S_CASEFOLD); + S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5555,6 +5573,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fsverity_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..5444d49cbf09 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1113,8 +1113,35 @@ resizefs_out: #endif } case EXT4_IOC_GET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_add_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return 
fscrypt_ioctl_remove_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key_all_users(filp, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1171,6 +1198,17 @@ out: } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); + + case FS_IOC_ENABLE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_enable(filp, (const void __user *)arg); + + case FS_IOC_MEASURE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_measure(filp, (void __user *)arg); + default: return -ENOTTY; } @@ -1231,8 +1269,15 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c916017db334..a30b203fa461 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -47,13 +47,103 @@ #include "ext4.h" -static inline bool ext4_bio_encrypted(struct bio *bio) +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, + STEP_VERITY, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { -#ifdef CONFIG_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + struct page *page; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + /* will re-read again later */ + ClearPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fsverity_verify_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. 
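+	 *
+	 * A read needing both steps therefore bounces through two queues:
+	 * bio completion -> fscrypt workqueue (decrypt_work) -> fsverity
+	 * workqueue (verity_work) -> __read_end_io().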
+ */ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; } /* @@ -70,30 +160,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio) */ static void mpage_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + if (bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; - if (ext4_bio_encrypted(bio)) { - if (bio->bi_status) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); - return; - } + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + __read_end_io(bio); +} - if (!bio->bi_status) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); +static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, + struct bio *bio, + pgoff_t first_idx) +{ + unsigned int post_read_steps = 0; + struct bio_post_read_ctx *ctx = NULL; + + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) + post_read_steps |= 1 << STEP_DECRYPT; + + if (ext4_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; } + return ctx; +} - bio_put(bio); +static inline loff_t ext4_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); } int ext4_mpage_readpages(struct address_space *mapping, @@ -141,7 +254,8 @@ int ext4_mpage_readpages(struct address_space *mapping, block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = (ext4_readpage_limit(inode) + + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; @@ -218,6 +332,9 @@ int ext4_mpage_readpages(struct address_space *mapping, zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { + if (ext4_need_verity(inode, page->index) && + !fsverity_verify_page(page)) + goto set_error_page; SetPageUptodate(page); unlock_page(page); goto next_page; @@ -241,18 +358,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct fscrypt_ctx *ctx = NULL; + struct bio_post_read_ctx *ctx; - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(GFP_NOFS); - if (IS_ERR(ctx)) - goto set_error_page; - } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, 
BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + if (!bio) + goto set_error_page; + ctx = get_bio_post_read_ctx(inode, bio, page->index); + if (IS_ERR(ctx)) { + bio_put(bio); + bio = NULL; goto set_error_page; } bio_set_dev(bio, bdev); @@ -293,3 +408,29 @@ int ext4_mpage_readpages(struct address_space *mapping, submit_bio(bio); return 0; } + +int __init ext4_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = + kmem_cache_create("ext4_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void ext4_exit_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..27cd622676e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1107,6 +1107,9 @@ static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); + if (!drop) + drop = fscrypt_drop_inode(inode); + trace_ext4_drop_inode(inode, drop); return drop; } @@ -1179,6 +1182,7 @@ void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -4272,6 +4276,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &ext4_verityops; +#endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -4419,6 +4426,11 @@ no_journal: goto failed_mount_wq; } + if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); + goto failed_mount_wq; + } + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); @@ -6095,6 +6107,10 @@ static int __init ext4_init_fs(void) err = ext4_init_pending(); if (err) + goto out7; + + err = ext4_init_post_read_processing(); + if (err) goto out6; err = ext4_init_pageio(); @@ -6135,8 +6151,10 @@ out3: out4: ext4_exit_pageio(); out5: - ext4_exit_pending(); + ext4_exit_post_read_processing(); out6: + ext4_exit_pending(); +out7: ext4_exit_es(); return err; @@ -6153,6 +6171,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index b3cd7655a6ff..eb1efad0e20a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -242,6 +242,9 @@ EXT4_ATTR_FEATURE(encryption); #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); #endif +#ifdef CONFIG_FS_VERITY +EXT4_ATTR_FEATURE(verity); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { @@ -254,6 +257,9 @@ static struct attribute *ext4_feat_attrs[] = { #ifdef CONFIG_UNICODE ATTR_LIST(casefold), #endif +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c new file mode 100644 index 000000000000..d0d8a9795dd6 --- /dev/null +++ b/fs/ext4/verity.c @@ -0,0 
+1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/verity.c: fs-verity support for ext4 + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for ext4. + * + * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past + * the end of the file, starting at the first 64K boundary beyond i_size. This + * approach works because (a) verity files are readonly, and (b) pages fully + * beyond i_size aren't visible to userspace but can be read/written internally + * by ext4 with only some relatively small changes to ext4. This approach + * avoids having to depend on the EA_INODE feature and on rearchitecturing + * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and + * to support encrypting xattrs. Note that the verity metadata *must* be + * encrypted when the file is, since it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/quotaops.h> + +#include "ext4.h" +#include "ext4_extents.h" +#include "ext4_jbd2.h" + +static inline loff_t ext4_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *addr; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + addr = kmap_atomic(page); + memcpy(buf, addr + offset_in_page(pos), n); + kunmap_atomic(addr); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata; + void *addr; + int res; + + res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, + &page, &fsdata); + if (res) + return res; + + addr = kmap_atomic(page); + memcpy(addr + offset_in_page(pos), buf, n); + kunmap_atomic(addr); + + res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, + page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +static int ext4_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_add() */ + handle_t *handle; + int err; + + if (ext4_verity_in_progress(inode)) + return -EBUSY; + + /* + * Since the file was opened readonly, we have to initialize the jbd + * inode and quotas here and not rely on ->open() doing it. This must + * be done before evicting the inline data. 
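+	 *
+	 * (ext4_convert_inline_data() below may allocate blocks, which is
+	 * why the quota must be initialized first on this readonly fd.)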
+ */ + + err = ext4_inode_attach_jinode(inode); + if (err) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_convert_inline_data(inode); + if (err) + return err; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_warning_inode(inode, + "verity is only allowed on extent-based files"); + return -EOPNOTSUPP; + } + + /* + * ext4 uses the last allocated block to find the verity descriptor, so + * we must remove any other blocks past EOF which might confuse things. + */ + err = ext4_truncate(inode); + if (err) + return err; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err == 0) + ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + ext4_journal_stop(handle); + return err; +} + +/* + * ext4 stores the verity descriptor beginning on the next filesystem block + * boundary after the Merkle tree. Then, the descriptor size is stored in the + * last 4 bytes of the last allocated filesystem block --- which is either the + * block in which the descriptor ends, or the next block after that if there + * weren't at least 4 bytes remaining. + * + * We can't simply store the descriptor in an xattr because it *must* be + * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt + * xattrs. Also, if the descriptor includes a large signature blob it may be + * too large to store in an xattr without the EA_INODE feature. + */ +static int ext4_write_verity_descriptor(struct inode *inode, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) + + merkle_tree_size, i_blocksize(inode)); + const u64 desc_end = desc_pos + desc_size; + const __le32 desc_size_disk = cpu_to_le32(desc_size); + const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk), + i_blocksize(inode)) - + sizeof(desc_size_disk); + int err; + + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + return err; + + return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); +} + +static int ext4_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_del() */ + handle_t *handle; + int err = 0; + int err2; + + if (desc != NULL) { + /* Succeeded; write the verity descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + + /* Write all pages before clearing VERITY_IN_PROGRESS. */ + if (!err) + err = filemap_write_and_wait(inode->i_mapping); + } + + /* If we failed, truncate anything we wrote past i_size. */ + if (desc == NULL || err) + ext4_truncate(inode); + + /* + * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and + * deleting the inode from the orphan list, even if something failed. + * If everything succeeded, we'll also set the verity bit in the same + * transaction. 
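+	 *
+	 * The code below keeps the first failure in 'err' and the cleanup
+	 * result in 'err2'; the final 'err ?: err2' reports whichever
+	 * failed first.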
+ */ + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + err2 = ext4_orphan_del(handle, inode); + if (err2) + goto out_stop; + + if (desc != NULL && !err) { + struct ext4_iloc iloc; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } +out_stop: + ext4_journal_stop(handle); + return err ?: err2; +} + +static int ext4_get_verity_descriptor_location(struct inode *inode, + size_t *desc_size_ret, + u64 *desc_pos_ret) +{ + struct ext4_ext_path *path; + struct ext4_extent *last_extent; + u32 end_lblk; + u64 desc_size_pos; + __le32 desc_size_disk; + u32 desc_size; + u64 desc_pos; + int err; + + /* + * Descriptor size is in last 4 bytes of last allocated block. + * See ext4_write_verity_descriptor(). + */ + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + EXT4_ERROR_INODE(inode, "verity file doesn't use extents"); + return -EFSCORRUPTED; + } + + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + last_extent = path[path->p_depth].p_ext; + if (!last_extent) { + EXT4_ERROR_INODE(inode, "verity file has no extents"); + ext4_ext_drop_refs(path); + kfree(path); + return -EFSCORRUPTED; + } + + end_lblk = le32_to_cpu(last_extent->ee_block) + + ext4_ext_get_actual_len(last_extent); + desc_size_pos = (u64)end_lblk << inode->i_blkbits; + ext4_ext_drop_refs(path); + kfree(path); + + if (desc_size_pos < sizeof(desc_size_disk)) + goto bad; + desc_size_pos -= sizeof(desc_size_disk); + + err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); + if (err) + return err; + desc_size = le32_to_cpu(desc_size_disk); + + /* + * The descriptor is stored just before the desc_size_disk, but starting + * on a filesystem block boundary. 
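The descriptor placement is easiest to follow with concrete numbers. A standalone sketch (illustrative values only: 4K blocks, metadata at 64K, a 12000-byte Merkle tree, a 300-byte descriptor; none of these come from the patch) confirming that the write-side arithmetic in ext4_write_verity_descriptor() and the read-side recovery land on the same offsets:

#include <assert.h>
#include <stdint.h>

#define BLK 4096ULL

static uint64_t rup(uint64_t v)   { return (v + BLK - 1) & ~(BLK - 1); }
static uint64_t rdown(uint64_t v) { return v & ~(BLK - 1); }

int main(void)
{
	uint64_t meta = 65536, tree = 12000, desc = 300;

	/* write side */
	uint64_t desc_pos = rup(meta + tree);			/* 77824 */
	uint64_t size_pos = rup(desc_pos + desc + 4) - 4;	/* 81916 */

	/* read side: start from the end of the last allocated block */
	uint64_t last_blk_end = rup(size_pos + 4);		/* 81920 */
	assert(last_blk_end - 4 == size_pos);
	assert(rdown(size_pos - desc) == desc_pos);
	return 0;
}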
+ */ + + if (desc_size > INT_MAX || desc_size > desc_size_pos) + goto bad; + + desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode)); + if (desc_pos < ext4_verity_metadata_pos(inode)) + goto bad; + + *desc_size_ret = desc_size; + *desc_pos_ret = desc_pos; + return 0; + +bad: + EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor"); + return -EFSCORRUPTED; +} + +static int ext4_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + size_t desc_size = 0; + u64 desc_pos = 0; + int err; + + err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos); + if (err) + return err; + + if (buf_size) { + if (desc_size > buf_size) + return -ERANGE; + err = pagecache_read(inode, buf, desc_size, desc_pos); + if (err) + return err; + } + return desc_size; +} + +static struct page *ext4_read_merkle_tree_page(struct inode *inode, + pgoff_t index) +{ + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; + + return read_mapping_page(inode->i_mapping, index, NULL); +} + +static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations ext4_verityops = { + .begin_enable_verity = ext4_begin_enable_verity, + .end_enable_verity = ext4_end_enable_verity, + .get_verity_descriptor = ext4_get_verity_descriptor, + .read_merkle_tree_page = ext4_read_merkle_tree_page, + .write_merkle_tree_block = ext4_write_merkle_tree_block, +}; diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 776c4b936504..2aaecc63834f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -8,3 +8,4 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o +f2fs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index abbf14e9bd72..54cad80acb7d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -74,6 +74,7 @@ static enum count_type __read_io_type(struct page *page) enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, + STEP_VERITY, }; struct bio_post_read_ctx { @@ -120,8 +121,23 @@ static void decrypt_work(struct work_struct *work) bio_post_read_processing(ctx); } +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fsverity_verify_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. 
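Stripped of the bio and workqueue plumbing, the dispatch below reduces to a per-bit stage walk. A compressed sketch (hypothetical synchronous handlers; the real code instead punts each stage to its own workqueue for the reason given above):

/* Stand-ins for the decrypt_work()/verity_work() punts. */
static void decrypt_stage(void) { /* fscrypt decryption would run here */ }
static void verity_stage(void)  { /* Merkle tree verification would run here */ }

static void post_read_sketch(unsigned int enabled_steps)
{
	if (enabled_steps & (1U << 1))	/* STEP_DECRYPT */
		decrypt_stage();
	if (enabled_steps & (1U << 2))	/* STEP_VERITY */
		verity_stage();
	/* all enabled stages done: the equivalent of __read_end_io(bio) */
}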
+ */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { @@ -131,6 +147,14 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) } ctx->cur_step++; /* fall-through */ + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ default: __read_end_io(ctx->bio); } @@ -608,8 +632,15 @@ out: up_write(&io->io_rwsem); } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages, unsigned op_flag) + unsigned nr_pages, unsigned op_flag, + pgoff_t first_idx) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -625,6 +656,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; + + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + if (post_read_steps) { ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); if (!ctx) { @@ -646,7 +681,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0); + bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1569,6 +1604,15 @@ out: return ret; } +static inline loff_t f2fs_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || f2fs_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); +} + static int f2fs_read_single_page(struct inode *inode, struct page *page, unsigned nr_pages, struct f2fs_map_blocks *map, @@ -1587,7 +1631,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, block_in_file = (sector_t)page_index(page); last_block = block_in_file + nr_pages; - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> + last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -1632,6 +1676,11 @@ got_it: } else { zero_out: zero_user_segment(page, 0, PAGE_SIZE); + if (f2fs_need_verity(inode, page->index) && + !fsverity_verify_page(page)) { + ret = -EIO; + goto out; + } if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); @@ -1650,7 +1699,7 @@ submit_and_realloc: } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, - is_readahead ? REQ_RAHEAD : 0); + is_readahead ? 
REQ_RAHEAD : 0, page->index); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2052,7 +2101,7 @@ static int __write_data_page(struct page *page, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (page->index < end_index) + if (page->index < end_index || f2fs_verity_in_progress(inode)) goto write; /* @@ -2427,7 +2476,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); - if (to > i_size) { + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ + if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); @@ -2458,7 +2508,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * the block addresses when there is no need to fill the page. */ if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC)) + !is_inode_flag_set(inode, FI_NO_PREALLOC) && + !f2fs_verity_in_progress(inode)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ @@ -2597,7 +2648,8 @@ repeat: if (len == PAGE_SIZE || PageUptodate(page)) return 0; - if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && + !f2fs_verity_in_progress(inode)) { zero_user_segment(page, len, PAGE_SIZE); return 0; } @@ -2660,7 +2712,8 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); - if (pos + copied > i_size_read(inode)) + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) f2fs_i_size_write(inode, pos + copied); unlock_out: f2fs_put_page(page, 1); @@ -3104,7 +3157,9 @@ void f2fs_clear_page_cache_dirty_tag(struct page *page) int __init f2fs_init_post_read_processing(void) { - bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); + bio_post_read_ctx_cache = + kmem_cache_create("f2fs_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 17382da7f0bd..7c5f121edac5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -25,6 +25,7 @@ #include <crypto/hash.h> #include <linux/fscrypt.h> +#include <linux/fsverity.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -151,7 +152,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 -#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ +#define F2FS_FEATURE_VERITY 0x0400 #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define __F2FS_HAS_FEATURE(raw_super, mask) \ @@ -630,7 +631,7 @@ enum { #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 -#define FADVISE_VERITY_BIT 0x40 /* reserved */ +#define FADVISE_VERITY_BIT 0x40 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -650,6 +651,8 @@ enum { #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) +#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) +#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) #define DEF_DIR_LEVEL 0 @@ -2412,6 +2415,7 @@ enum { FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not 
be gced */ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2451,6 +2455,12 @@ static inline void clear_inode_flag(struct inode *inode, int flag) __mark_inode_dirty_flag(inode, flag, false); } +static inline bool f2fs_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS); +} + static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; @@ -3521,6 +3531,9 @@ void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); +/* verity.c */ +extern const struct fsverity_operations f2fs_verityops; + /* * crypto support */ @@ -3543,7 +3556,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) */ static inline bool f2fs_post_read_required(struct inode *inode) { - return f2fs_encrypted_file(inode); + return f2fs_encrypted_file(inode) || fsverity_active(inode); } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3561,6 +3574,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); #ifdef CONFIG_BLK_DEV_ZONED diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3e58a6f697dd..39fffc19e00c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -496,6 +496,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + err = fsverity_file_open(inode, filp); + if (err) + return err; + filp->f_mode |= FMODE_NOWAIT; return dquot_file_open(inode, filp); @@ -778,6 +782,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + err = fsverity_prepare_setattr(dentry, attr); + if (err) + return err; + if (is_quota_modification(inode, attr)) { err = dquot_initialize(inode); if (err) @@ -1705,7 +1713,8 @@ static const struct { FS_PROJINHERIT_FL | \ FS_ENCRYPT_FL | \ FS_INLINE_DATA_FL | \ - FS_NOCOW_FL) + FS_NOCOW_FL | \ + FS_VERITY_FL) #define F2FS_SETTABLE_FS_FL ( \ FS_SYNC_FL | \ @@ -1750,6 +1759,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) if (IS_ENCRYPTED(inode)) fsflags |= FS_ENCRYPT_FL; + if (IS_VERITY(inode)) + fsflags |= FS_VERITY_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) fsflags |= FS_INLINE_DATA_FL; if (is_inode_flag_set(inode, FI_PIN_FILE)) @@ -2184,6 +2195,49 @@ out_err: return err; } +static int f2fs_ioc_get_encryption_policy_ex(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); +} + +static int f2fs_ioc_add_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_add_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key_all_users(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return 
-EOPNOTSUPP; + + return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_key_status(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); +} + static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3060,6 +3114,30 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) return ret; } +static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { + f2fs_warn(F2FS_I_SB(inode), + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n", + inode->i_ino); + return -EOPNOTSUPP; + } + + return fsverity_ioctl_enable(filp, (const void __user *)arg); +} + +static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_measure(filp, (void __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3092,6 +3170,16 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_policy(filp, arg); case F2FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return f2fs_ioc_get_encryption_policy_ex(filp, arg); + case FS_IOC_ADD_ENCRYPTION_KEY: + return f2fs_ioc_add_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return f2fs_ioc_remove_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return f2fs_ioc_remove_encryption_key_all_users(filp, arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return f2fs_ioc_get_encryption_key_status(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); case F2FS_IOC_GARBAGE_COLLECT_RANGE: @@ -3118,6 +3206,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_precache_extents(filp, arg); case F2FS_IOC_RESIZE_FS: return f2fs_ioc_resize_fs(filp, arg); + case FS_IOC_ENABLE_VERITY: + return f2fs_ioc_enable_verity(filp, arg); + case FS_IOC_MEASURE_VERITY: + return f2fs_ioc_measure_verity(filp, arg); default: return -ENOTTY; } @@ -3219,6 +3311,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_ENCRYPTION_POLICY: case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: @@ -3232,6 +3329,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_PIN_FILE: case F2FS_IOC_PRECACHE_EXTENTS: case F2FS_IOC_RESIZE_FS: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a33d7a849b2d..06da75d418e0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -46,9 +46,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; if 
(file_is_encrypt(inode)) new_fl |= S_ENCRYPTED; + if (file_is_verity(inode)) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| - S_ENCRYPTED); + S_ENCRYPTED|S_VERITY); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -733,6 +735,7 @@ no_delete: } out_clear: fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); clear_inode(inode); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 78a1b873e48a..f43befda0e1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -913,6 +913,8 @@ static int f2fs_drop_inode(struct inode *inode) return 0; } ret = generic_drop_inode(inode); + if (!ret) + ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); return ret; } @@ -3144,6 +3146,9 @@ try_onemore: #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; #endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &f2fs_verityops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3aeacd0aacfd..0cd64f994068 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -131,6 +131,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_lost_found(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); + if (f2fs_sb_has_verity(sbi)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "verity"); if (f2fs_sb_has_sb_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "sb_checksum"); @@ -364,6 +367,7 @@ enum feat_id { FEAT_QUOTA_INO, FEAT_INODE_CRTIME, FEAT_LOST_FOUND, + FEAT_VERITY, FEAT_SB_CHECKSUM, }; @@ -381,6 +385,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: case FEAT_LOST_FOUND: + case FEAT_VERITY: case FEAT_SB_CHECKSUM: return snprintf(buf, PAGE_SIZE, "supported\n"); } @@ -470,6 +475,9 @@ F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +#ifdef CONFIG_FS_VERITY +F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); +#endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -534,6 +542,9 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), ATTR_LIST(lost_found), +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif ATTR_LIST(sb_checksum), NULL, }; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c new file mode 100644 index 000000000000..a401ef72bc82 --- /dev/null +++ b/fs/f2fs/verity.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/verity.c: fs-verity support for f2fs + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for f2fs. + * + * Like ext4, f2fs stores the verity metadata (Merkle tree and + * fsverity_descriptor) past the end of the file, starting at the first 64K + * boundary beyond i_size. This approach works because (a) verity files are + * readonly, and (b) pages fully beyond i_size aren't visible to userspace but + * can be read/written internally by f2fs with only some relatively small + * changes to f2fs. 
Extended attributes cannot be used because (a) f2fs limits + * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be + * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't + * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is + * because it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *addr; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + addr = kmap_atomic(page); + memcpy(buf, addr + offset_in_page(pos), n); + kunmap_atomic(addr); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata; + void *addr; + int res; + + res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, + &page, &fsdata); + if (res) + return res; + + addr = kmap_atomic(page); + memcpy(addr + offset_in_page(pos), buf, n); + kunmap_atomic(addr); + + res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, + page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Format of f2fs verity xattr. This points to the location of the verity + * descriptor within the file data rather than containing it directly because + * the verity descriptor *must* be encrypted when f2fs encryption is used. But, + * f2fs encryption does not encrypt xattrs. + */ +struct fsverity_descriptor_location { + __le32 version; + __le32 size; + __le64 pos; +}; + +static int f2fs_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int err; + + if (f2fs_verity_in_progress(inode)) + return -EBUSY; + + if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + return -EOPNOTSUPP; + + /* + * Since the file was opened readonly, we have to initialize the quotas + * here and not rely on ->open() doing it. This must be done before + * evicting the inline data. 
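Looking back at the fsverity_descriptor_location record defined above: it is a fixed 16-byte little-endian blob, so it fits comfortably in an xattr on any architecture. A standalone sketch (plain C types standing in for __le32/__le64) that pins down the layout:

#include <assert.h>
#include <stdint.h>

struct dloc_sketch {
	uint32_t version;	/* always 1 in this format */
	uint32_t size;		/* descriptor size in bytes */
	uint64_t pos;		/* descriptor offset within the file */
};

int main(void)
{
	/* natural alignment leaves no padding: 4 + 4 + 8 = 16 bytes */
	assert(sizeof(struct dloc_sketch) == 16);
	return 0;
}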
+ */ + err = dquot_initialize(inode); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + set_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; +} + +static int f2fs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; + struct fsverity_descriptor_location dloc = { + .version = cpu_to_le32(1), + .size = cpu_to_le32(desc_size), + .pos = cpu_to_le64(desc_pos), + }; + int err = 0; + + if (desc != NULL) { + /* Succeeded; write the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + + /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ + if (!err) + err = filemap_write_and_wait(inode->i_mapping); + } + + /* If we failed, truncate anything we wrote past i_size. */ + if (desc == NULL || err) + f2fs_truncate(inode); + + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + + if (desc != NULL && !err) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (!err) { + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } + } + return err; +} + +static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + struct fsverity_descriptor_location dloc; + int res; + u32 size; + u64 pos; + + /* Get the descriptor location */ + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL); + if (res < 0 && res != -ERANGE) + return res; + if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) { + f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format"); + return -EINVAL; + } + size = le32_to_cpu(dloc.size); + pos = le64_to_cpu(dloc.pos); + + /* Get the descriptor */ + if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || + pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { + f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + return -EFSCORRUPTED; + } + if (buf_size) { + if (size > buf_size) + return -ERANGE; + res = pagecache_read(inode, buf, size, pos); + if (res) + return res; + } + return size; +} + +static struct page *f2fs_read_merkle_tree_page(struct inode *inode, + pgoff_t index) +{ + index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; + + return read_mapping_page(inode->i_mapping, index, NULL); +} + +static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations f2fs_verityops = { + .begin_enable_verity = f2fs_begin_enable_verity, + .end_enable_verity = f2fs_end_enable_verity, + .get_verity_descriptor = f2fs_get_verity_descriptor, + .read_merkle_tree_page = f2fs_read_merkle_tree_page, + .write_merkle_tree_block = f2fs_write_merkle_tree_block, +}; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index a90920e2f949..de0c600b9cab 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -34,8 +34,10 @@ #define F2FS_XATTR_INDEX_ADVISE 7 /* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ #define F2FS_XATTR_INDEX_ENCRYPTION 9 +#define F2FS_XATTR_INDEX_VERITY 11 #define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" +#define F2FS_XATTR_NAME_VERITY "v" struct f2fs_xattr_header { __le32 h_magic; /* magic number for identification */ 
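The single bounds check in f2fs_get_verity_descriptor() above folds several validations together; restated as a standalone sketch (hypothetical helper, userspace types) to make each clause's purpose explicit:

#include <limits.h>
#include <stdbool.h>
#include <stdint.h>

static bool desc_range_ok(uint64_t pos, uint32_t size,
			  uint64_t s_maxbytes, uint64_t metadata_pos)
{
	if (pos + size < pos)		/* u64 wraparound in pos + size */
		return false;
	if (pos + size > s_maxbytes)	/* runs past the fs size limit */
		return false;
	if (pos < metadata_pos)		/* would point back into file data */
		return false;
	if (size > INT_MAX)		/* the return value must fit in an int */
		return false;
	return true;
}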
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 542b02d170f8..8aaa7eec7b74 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,10 +36,6 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) -struct wb_completion { - atomic_t cnt; -}; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -61,19 +57,6 @@ struct wb_writeback_work { }; /* - * If one wants to wait for one or more wb_writeback_works, each work's - * ->done should be set to a wb_completion defined using the following - * macro. Once all work items are issued with wb_queue_work(), the caller - * can wait for the completion of all using wb_wait_for_completion(). Work - * items which are waited upon aren't freed automatically on completion. - */ -#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ - struct wb_completion cmpl = { \ - .cnt = ATOMIC_INIT(1), \ - } - - -/* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its @@ -182,7 +165,7 @@ static void finish_writeback_work(struct bdi_writeback *wb, if (work->auto_free) kfree(work); if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(&wb->bdi->wb_waitq); + wake_up_all(done->waitq); } static void wb_queue_work(struct bdi_writeback *wb, @@ -206,28 +189,44 @@ static void wb_queue_work(struct bdi_writeback *wb, /** * wb_wait_for_completion - wait for completion of bdi_writeback_works - * @bdi: bdi work items were issued to * @done: target wb_completion * * Wait for one or more work items issued to @bdi with their ->done field - * set to @done, which should have been defined with - * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such - * work items are completed. Work items which are waited upon aren't freed + * set to @done, which should have been initialized with + * DEFINE_WB_COMPLETION(). This function returns after all such work items + * are completed. Work items which are waited upon aren't freed + * automatically on completion. */ -static void wb_wait_for_completion(struct backing_dev_info *bdi, - struct wb_completion *done) +void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, !atomic_read(&done->cnt)); } #ifdef CONFIG_CGROUP_WRITEBACK -/* parameters for foreign inode detection, see wb_detach_inode() */ +/* + * Parameters for foreign inode detection, see wbc_detach_inode() to see + * how they're used. + * + * These parameters are inherently heuristic as the detection target + * itself is fuzzy. All we want to do is to detach an inode from the + * current owner if it's being written to by some other cgroups too much. + * + * The current cgroup writeback is built on the assumption that multiple + * cgroups writing to the same inode concurrently is very rare and a mode + * of operation which isn't well supported. As such, the goal is not + * taking too long when a different cgroup takes over an inode while + * avoiding too aggressive flip-flops from occasional foreign writes. + * + * We record, very roughly, 2s worth of IO time history and if more than + * half of that is foreign, trigger the switch. The recording is quantized + * to 16 slots. To avoid tiny writes from swinging the decision too much, + * writes smaller than 1/8 of avg size are ignored.
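Concretely, the quantized history described above acts as a 16-bit shift register. A rough standalone sketch (assumed helper shape; the real bookkeeping lives in wbc_detach_inode() and caps the slots per round):

#include <stdbool.h>
#include <stdint.h>

#define FRN_HIST_SLOTS	16
#define FRN_HIST_THR	(FRN_HIST_SLOTS / 2)	/* >= 8 foreign slots: switch */

/* One recording round: age out `slots` old entries (at most 5 per round
 * in the kernel), mark them foreign or local, then test the threshold. */
static bool frn_record_round(uint16_t *history, unsigned int slots,
			     bool foreign)
{
	*history <<= slots;
	if (foreign)
		*history |= (1U << slots) - 1;

	return __builtin_popcount(*history) >= FRN_HIST_THR;
}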
+ */ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, up to 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ -#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ @@ -237,6 +236,7 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect up to 5 slots */ +#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; + trace_inode_switch_wbs(inode, old_wb, new_wb); + /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to @@ -489,18 +491,13 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; - /* - * Avoid starting new switches while sync_inodes_sb() is in - * progress. Otherwise, if the down_write protected issue path - * blocks heavily, we might end up starting a large number of - * switches which will block on the rwsem. - */ - if (!down_read_trylock(&bdi->wb_switch_rwsem)) + /* avoid queueing a new switch if too many are already in flight */ + if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - goto out_unlock; + return; /* find and pin the new wb */ rcu_read_lock(); @@ -534,15 +531,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); atomic_inc(&isw_nr_in_flight); - - goto out_unlock; + return; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); -out_unlock: - up_read(&bdi->wb_switch_rwsem); } /** @@ -681,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc) if (wbc->wb_id != max_id) history |= (1U << slots) - 1; + if (history) + trace_inode_foreign_history(inode, wbc, history); + /* * Switch if the current wb isn't the consistent winner. * If there are multiple closely competing dirtiers, the @@ -843,7 +840,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { - DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); + DEFINE_WB_COMPLETION(fallback_work_done, bdi); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; @@ -890,7 +887,7 @@ restart: last_wb = wb; rcu_read_unlock(); - wb_wait_for_completion(bdi, &fallback_work_done); + wb_wait_for_completion(&fallback_work_done); goto restart; } rcu_read_unlock(); @@ -900,6 +897,89 @@ restart: } /** + * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs + * @bdi_id: target bdi id + * @memcg_id: target memcg css id + * @nr: number of pages to write, 0 for best-effort dirty flushing + * @reason: reason why some writeback work was initiated + * @done: target wb_completion + * + * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id + * with the specified parameters.
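For context, a minimal usage sketch of the reworked completion API that this kernel-doc builds on (abridged from the __writeback_inodes_sb_nr() hunk further down; kernel context assumed): the wait queue now travels inside struct wb_completion, so waiters no longer pass the bdi separately:

static void flush_sb_sketch(struct super_block *sb)
{
	struct backing_dev_info *bdi = sb->s_bdi;
	DEFINE_WB_COMPLETION(done, bdi);	/* binds done's waitq to bdi */
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_NONE,
		.done		= &done,
	};

	bdi_split_work_to_wbs(bdi, &work, false);
	wb_wait_for_completion(&done);		/* no bdi argument anymore */
}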
+ */ +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, + enum wb_reason reason, struct wb_completion *done) +{ + struct backing_dev_info *bdi; + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + struct wb_writeback_work *work; + int ret; + + /* lookup bdi and memcg */ + bdi = bdi_get_by_id(bdi_id); + if (!bdi) + return -ENOENT; + + rcu_read_lock(); + memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys); + if (memcg_css && !css_tryget(memcg_css)) + memcg_css = NULL; + rcu_read_unlock(); + if (!memcg_css) { + ret = -ENOENT; + goto out_bdi_put; + } + + /* + * And find the associated wb. If the wb isn't there already + * there's nothing to flush, don't create one. + */ + wb = wb_get_lookup(bdi, memcg_css); + if (!wb) { + ret = -ENOENT; + goto out_css_put; + } + + /* + * If @nr is zero, the caller is attempting to write out most of + * the currently dirty pages. Let's take the current dirty page + * count and inflate it by 25% which should be large enough to + * flush out most dirty pages while avoiding getting livelocked by + * concurrent dirtiers. + */ + if (!nr) { + unsigned long filepages, headroom, dirty, writeback; + + mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, + &writeback); + nr = dirty * 10 / 8; + } + + /* issue the writeback work */ + work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + if (work) { + work->nr_pages = nr; + work->sync_mode = WB_SYNC_NONE; + work->range_cyclic = 1; + work->reason = reason; + work->done = done; + work->auto_free = 1; + wb_queue_work(wb, work); + ret = 0; + } else { + ret = -ENOMEM; + } + + wb_put(wb); +out_css_put: + css_put(memcg_css); +out_bdi_put: + bdi_put(bdi); + return ret; +} + +/** * cgroup_writeback_umount - flush inode wb switches for umount * * This function is called when a super_block is about to be destroyed and @@ -2362,7 +2442,8 @@ static void wait_sb_inodes(struct super_block *sb) static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { - DEFINE_WB_COMPLETION_ONSTACK(done); + struct backing_dev_info *bdi = sb->s_bdi; + DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, @@ -2371,14 +2452,13 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, .nr_pages = nr, .reason = reason, }; - struct backing_dev_info *bdi = sb->s_bdi; if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); - wb_wait_for_completion(bdi, &done); + wb_wait_for_completion(&done); } /** @@ -2440,7 +2520,8 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DEFINE_WB_COMPLETION_ONSTACK(done); + struct backing_dev_info *bdi = sb->s_bdi; + DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -2450,7 +2531,6 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, .for_sync = 1, }; - struct backing_dev_info *bdi = sb->s_bdi; /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty @@ -2464,7 +2544,7 @@ void sync_inodes_sb(struct super_block *sb) /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); - wb_wait_for_completion(bdi, &done); + wb_wait_for_completion(&done); bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); diff --git 
a/fs/fs_context.c b/fs/fs_context.c index 103643c68e3f..87c2c9687d90 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -279,10 +279,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); break; case FS_CONTEXT_FOR_RECONFIGURE: - /* We don't pin any namespaces as the superblock's - * subscriptions cannot be changed at this point. - */ atomic_inc(&reference->d_sb->s_active); + fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); fc->root = dget(reference); break; } diff --git a/fs/io_uring.c b/fs/io_uring.c index cfb48bd088e1..0dadbdbead0f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -75,7 +75,7 @@ #include "internal.h" -#define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_ENTRIES 32768 #define IORING_MAX_FIXED_FILES 1024 struct io_uring { @@ -84,27 +84,29 @@ struct io_uring { }; /* - * This data is shared with the application through the mmap at offset - * IORING_OFF_SQ_RING. + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. * * The offsets to the member fields are published through struct * io_sqring_offsets when calling io_uring_setup. */ -struct io_sq_ring { +struct io_rings { /* * Head and tail offsets into the ring; the offsets need to be * masked to get valid indices. * - * The kernel controls head and the application controls tail. + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. */ - struct io_uring r; + struct io_uring sq, cq; /* - * Bitmask to apply to head and tail offsets (constant, equals + * Bitmasks to apply to head and tail offsets (constant, equals * ring_entries - 1) */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; /* * Number of invalid entries dropped by the kernel due to * invalid index stored in array @@ -117,7 +119,7 @@ struct io_sq_ring { * counter includes all submissions that were dropped reaching * the new SQ head (and possibly more). */ - u32 dropped; + u32 sq_dropped; /* * Runtime flags * @@ -127,43 +129,7 @@ struct io_sq_ring { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 flags; - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 array[]; -}; - -/* - * This data is shared with the application through the mmap at offset - * IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_cqring_offsets when calling io_uring_setup. - */ -struct io_cq_ring { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The application controls head and the kernel tail. 
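Given the ownership rules spelled out above (the application owns the SQ tail and the CQ head), the userspace half of the submission handshake looks roughly like this sketch (hypothetical app-side bookkeeping struct; C11 atomics standing in for the kernel's smp_load_acquire/smp_store_release pairing):

#include <stdatomic.h>
#include <stdint.h>

struct app_sq {
	_Atomic uint32_t *ktail;	/* mapped sq.tail, written by the app */
	uint32_t *array;		/* mapped sqe index array */
	uint32_t mask;			/* sq_ring_mask */
	uint32_t local_tail;		/* app's private copy of the tail */
};

/* Caller must have checked that the ring isn't full. */
static void sq_push(struct app_sq *sq, uint32_t sqe_index)
{
	sq->array[sq->local_tail & sq->mask] = sqe_index;
	/* release-store pairs with the kernel's acquire load of sq.tail,
	 * making the array write visible before the new tail */
	atomic_store_explicit(sq->ktail, ++sq->local_tail,
			      memory_order_release);
}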
- */ - struct io_uring r; - /* - * Bitmask to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -177,7 +143,7 @@ struct io_cq_ring { * As completion events come in out of order this counter is not * ordered with any other data. */ - u32 overflow; + u32 cq_overflow; /* * Ring buffer of completion events. * @@ -185,7 +151,7 @@ struct io_cq_ring { * produced, so the application is allowed to modify pending * entries. */ - struct io_uring_cqe cqes[]; + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; struct io_mapped_ubuf { @@ -201,7 +167,7 @@ struct async_list { struct list_head list; struct file *file; - off_t io_end; + off_t io_start; size_t io_len; }; @@ -215,8 +181,18 @@ struct io_ring_ctx { bool compat; bool account_mem; - /* SQ ring */ - struct io_sq_ring *sq_ring; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ + u32 *sq_array; unsigned cached_sq_head; unsigned sq_entries; unsigned sq_mask; @@ -227,15 +203,13 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IO offload */ - struct workqueue_struct *sqo_wq; + struct workqueue_struct *sqo_wq[2]; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; struct completion sqo_thread_started; struct { - /* CQ ring */ - struct io_cq_ring *cq_ring; unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; @@ -244,6 +218,8 @@ struct io_ring_ctx { struct eventfd_ctx *cq_ev_fd; } ____cacheline_aligned_in_smp; + struct io_rings *rings; + /* * If used, fixed file set. 
Writers must ensure that ->refs is dead, * readers must ensure that ->refs is alive as long as the file* is @@ -288,6 +264,7 @@ struct io_ring_ctx { struct sqe_submit { const struct io_uring_sqe *sqe; unsigned short index; + u32 sequence; bool has_user; bool needs_lock; bool needs_fixed_file; @@ -335,6 +312,7 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ +#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ u64 user_data; u32 result; u32 sequence; @@ -366,6 +344,7 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -430,7 +409,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx, if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped; + return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -451,11 +430,11 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) static void __io_commit_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { /* order cqe stores with ring update */ - smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); @@ -464,6 +443,24 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_queue_async_work(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + int rw; + + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + default: + rw = 0; + break; + } + + queue_work(ctx->sqo_wq[rw], &req->work); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; @@ -471,14 +468,19 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { + if (req->flags & REQ_F_SHADOW_DRAIN) { + /* Just for drain, free it. 
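To make the consolidated mapping concrete: after this change a single allocation backs both IORING_OFF_SQ_RING and IORING_OFF_CQ_RING, laid out as the io_rings header, then cqes[], then the SQ index array (computed by rings_size() further down), while the SQEs stay in their own IORING_OFF_SQES mapping. A userspace mirror of that offset arithmetic under assumed sizes (64-byte cache lines, 16-byte CQEs, an illustrative 192-byte header):

#include <stdint.h>
#include <stdio.h>

static size_t rings_size_sketch(unsigned sq_entries, unsigned cq_entries,
				size_t hdr_size, size_t *sq_off)
{
	size_t off = hdr_size + (size_t)cq_entries * 16;	/* cqes[] */

	off = (off + 63) & ~(size_t)63;	/* ALIGN(off, SMP_CACHE_BYTES) */
	if (sq_off)
		*sq_off = off;		/* sq_array begins here */
	return off + (size_t)sq_entries * sizeof(uint32_t);
}

int main(void)
{
	size_t sq_off;
	size_t total = rings_size_sketch(128, 256, 192, &sq_off);

	printf("sq_array at %zu, total %zu bytes\n", sq_off, total);	/* 4288, 4800 */
	return 0;
}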
*/ + __io_free_req(req); + continue; + } req->flags |= REQ_F_IO_DRAINED; - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; unsigned tail; tail = ctx->cached_cq_tail; @@ -487,11 +489,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) + if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries) return NULL; ctx->cached_cq_tail++; - return &ring->cqes[tail & ctx->cq_mask]; + return &rings->cqes[tail & ctx->cq_mask]; } static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, @@ -510,9 +512,9 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + unsigned overflow = READ_ONCE(ctx->rings->cq_overflow); - WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1); } } @@ -635,7 +637,7 @@ static void io_req_link_next(struct io_kiocb *req) nxt->flags |= REQ_F_LINK_DONE; INIT_WORK(&nxt->work, io_sq_wq_submit_work); - queue_work(req->ctx->sqo_wq, &nxt->work); + io_queue_async_work(req->ctx, nxt); } } @@ -679,11 +681,11 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static unsigned io_cqring_events(struct io_cq_ring *ring) +static unsigned io_cqring_events(struct io_rings *rings) { /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); + return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); } /* @@ -836,7 +838,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx->cq_ring)) + if (io_cqring_events(ctx->rings)) break; /* @@ -1187,6 +1189,28 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb) +{ + if (al->file == kiocb->ki_filp) { + off_t start, end; + + /* + * Allow merging if we're anywhere in the range of the same + * page. Generally this happens for sub-page reads or writes, + * and it's beneficial to allow the first worker to bring the + * page in and the piggy backed work can then work on the + * cached page. + */ + start = al->io_start & PAGE_MASK; + end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK; + if (kiocb->ki_pos >= start && kiocb->ki_pos <= end) + return true; + } + + al->file = NULL; + return false; +} + /* * Make a note of the last file/offset/direction we punted to async * context. 
We'll use this information to see if we can piggy back a @@ -1198,9 +1222,8 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) { struct async_list *async_list = &req->ctx->pending_async[rw]; struct kiocb *kiocb = &req->rw; struct file *filp = kiocb->ki_filp; - off_t io_end = kiocb->ki_pos + len; - if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) { + if (io_should_merge(async_list, kiocb)) { unsigned long max_bytes; /* Use 8x RA size as a decent limiter for both reads/writes */ @@ -1213,17 +1236,16 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) req->flags |= REQ_F_SEQ_PREV; async_list->io_len += len; } else { - io_end = 0; - async_list->io_len = 0; + async_list->file = NULL; } } /* New file? Reset state. */ if (async_list->file != filp) { - async_list->io_len = 0; + async_list->io_start = kiocb->ki_pos; + async_list->io_len = len; async_list->file = filp; } - async_list->io_end = io_end; } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, @@ -1535,7 +1557,7 @@ static void io_poll_remove_one(struct io_kiocb *req) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - queue_work(req->ctx->sqo_wq, &req->work); + io_queue_async_work(req->ctx, req); } spin_unlock(&poll->head->lock); @@ -1649,7 +1671,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, io_cqring_ev_posted(ctx); io_put_req(req); } else { - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } return 1; @@ -1992,7 +2014,7 @@ out: */ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) { - bool ret = false; + bool ret; if (!list) return false; @@ -2038,10 +2060,14 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); - if (flags & IOSQE_IO_DRAIN) { + if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - req->sequence = ctx->cached_sq_head - 1; - } + /* + * All requests need to record the previous position; when LINK is + * combined with DRAIN, it marks the position of the first IO in the + * link list. + */
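The merge window used above (io_should_merge(), added a few hunks earlier) is page-granular rather than exact-offset. A standalone sketch with an assumed 4K page size:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PG 4096ULL

/* A new request may piggy-back if its offset falls within the pages
 * already spanned by the previously punted request. */
static bool should_merge_sketch(uint64_t prev_start, uint64_t prev_len,
				uint64_t new_pos)
{
	uint64_t start = prev_start & ~(PG - 1);
	uint64_t end = (prev_start + prev_len + PG - 1) & ~(PG - 1);

	return new_pos >= start && new_pos <= end;
}

int main(void)
{
	assert(should_merge_sketch(4096, 512, 4608));	/* same page */
	assert(should_merge_sketch(4096, 512, 8192));	/* rounded-up end */
	assert(!should_merge_sketch(4096, 512, 16384));	/* too far away */
	return 0;
}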
+ req->sequence = s->sequence; if (!io_op_needs_file(s->sqe)) return 0; @@ -2063,21 +2089,12 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s) +static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, bool force_nonblock) { int ret; - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_free_req(req); - io_cqring_add_event(ctx, s->sqe->user_data, ret); - } - return 0; - } - - ret = __io_submit_sqe(ctx, req, s, true); + ret = __io_submit_sqe(ctx, req, s, force_nonblock); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; @@ -2094,7 +2111,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (list) atomic_inc(&list->cnt); INIT_WORK(&req->work, io_sq_wq_submit_work); - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } /* @@ -2119,10 +2136,70 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; } +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, bool force_nonblock) +{ + int ret; + + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + } + return 0; + } + + return __io_queue_sqe(ctx, req, s, force_nonblock); +} + +static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, struct io_kiocb *shadow, + bool force_nonblock) +{ + int ret; + int need_submit = false; + + if (!shadow) + return io_queue_sqe(ctx, req, s, force_nonblock); + + /* + * Mark the first IO in the link list as DRAIN and let all the + * following IOs enter the defer list. All IOs ahead of the link list + * need to complete before the link list itself can run. + */ + req->flags |= REQ_F_IO_DRAIN; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + return 0; + } + } else { + /* + * ret == 0 means that all IOs ahead of the link IO have + * already completed, so it's safe to queue the link head now. + */ + need_submit = true; + } + + /* Insert shadow req to defer_list, blocking next IOs */ + spin_lock_irq(&ctx->completion_lock); + list_add_tail(&shadow->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + + if (need_submit) + return __io_queue_sqe(ctx, req, s, force_nonblock); + + return 0; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link) + struct io_submit_state *state, struct io_kiocb **link, + bool force_nonblock) { struct io_uring_sqe *sqe_copy; struct io_kiocb *req; @@ -2175,7 +2252,7 @@ err: INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s); + io_queue_sqe(ctx, req, s, force_nonblock); } } @@ -2205,15 +2282,15 @@ static void io_submit_state_start(struct io_submit_state *state, static void io_commit_sqring(struct io_ring_ctx *ctx) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them.
*/ - smp_store_release(&ring->r.head, ctx->cached_sq_head); + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } } @@ -2227,7 +2304,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; + u32 *sq_array = ctx->sq_array; unsigned head; /* @@ -2240,20 +2318,21 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&ring->r.tail)) + if (head == smp_load_acquire(&rings->sq.tail)) return false; - head = READ_ONCE(ring->array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (head < ctx->sq_entries) { s->index = head; s->sqe = &ctx->sq_sqes[head]; + s->sequence = ctx->cached_sq_head; ctx->cached_sq_head++; return true; } /* drop invalid entries */ ctx->cached_sq_head++; - ring->dropped++; + rings->sq_dropped++; return false; } @@ -2262,6 +2341,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submitted = 0; @@ -2276,11 +2356,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + true); link = NULL; } prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = sqes[i].sequence; + } + if (unlikely(mm_fault)) { io_cqring_add_event(ctx, sqes[i].sqe->user_data, -EFAULT); @@ -2288,13 +2378,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, sqes[i].has_user = has_user; sqes[i].needs_lock = true; sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link); + io_submit_sqe(ctx, &sqes[i], statep, &link, true); submitted++; } } if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, true); if (statep) io_submit_state_end(&state); @@ -2366,7 +2456,7 @@ static int io_sq_thread(void *data) TASK_INTERRUPTIBLE); /* Tell userspace we may need a wakeup call */ - ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; /* make sure to read SQ tail after writing flags */ smp_mb(); @@ -2380,12 +2470,12 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; continue; } finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } i = 0; @@ -2426,10 +2516,12 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, + bool block_for_last) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int 
i, submit = 0; @@ -2439,6 +2531,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) } for (i = 0; i < to_submit; i++) { + bool force_nonblock = true; struct sqe_submit s; if (!io_get_sqring(ctx, &s)) @@ -2449,21 +2542,43 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + force_nonblock); link = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = s.sequence; + } + s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; - io_submit_sqe(ctx, &s, statep, &link); + + /* + * The caller will block for events after submit, submit the + * last IO non-blocking. This is either the only IO it's + * submitting, or it already submitted the previous ones. This + * improves performance by avoiding an async punt that we don't + * need to do. + */ + if (block_for_last && submit == to_submit) + force_nonblock = false; + + io_submit_sqe(ctx, &s, statep, &link, force_nonblock); } io_commit_sqring(ctx); if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + block_for_last); if (statep) io_submit_state_end(statep); @@ -2477,10 +2592,10 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; int ret; - if (io_cqring_events(ring) >= min_events) + if (io_cqring_events(rings) >= min_events) return 0; if (sig) { @@ -2496,12 +2611,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); + ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; - return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -2551,11 +2666,15 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) static void io_finish_async(struct io_ring_ctx *ctx) { + int i; + io_sq_thread_stop(ctx); - if (ctx->sqo_wq) { - destroy_workqueue(ctx->sqo_wq); - ctx->sqo_wq = NULL; + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) { + if (ctx->sqo_wq[i]) { + destroy_workqueue(ctx->sqo_wq[i]); + ctx->sqo_wq[i] = NULL; + } } } @@ -2763,16 +2882,31 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, } /* Do QD, or 2 * CPUS, whatever is smallest */ - ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq", + WQ_UNBOUND | WQ_FREEZABLE, min(ctx->sq_entries - 1, 2 * num_online_cpus())); - if (!ctx->sqo_wq) { + if (!ctx->sqo_wq[0]) { + ret = -ENOMEM; + goto err; + } + + /* + * This is for buffered writes, where we want to limit the parallelism + * due to file locking in file systems. 
As "normal" buffered writes + should parallelize on writeout quite nicely, limit us to having 2 + pending. This avoids massive contention on the inode when doing + buffered async writes. + */ + ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq", + WQ_UNBOUND | WQ_FREEZABLE, 2); + if (!ctx->sqo_wq[1]) { ret = -ENOMEM; goto err; } return 0; err: - io_sq_thread_stop(ctx); + io_finish_async(ctx); mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; return ret; @@ -2821,17 +2955,45 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp_flags, get_order(size)); } +static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, + size_t *sq_offset) +{ + struct io_rings *rings; + size_t off, sq_array_size; + + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif + + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; + + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; + + if (sq_offset) + *sq_offset = off; + + return off; +} + static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t bytes; + size_t pages; - bytes = struct_size(sq_ring, array, sq_entries); - bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); - bytes += struct_size(cq_ring, cqes, cq_entries); + pages = (size_t)1 << get_order( + rings_size(sq_entries, cq_entries, NULL)); + pages += (size_t)1 << get_order( + array_size(sizeof(struct io_uring_sqe), sq_entries)); - return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + return pages; } static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) @@ -2845,7 +3007,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_page(imu->bvec[j].bv_page); + put_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -2989,10 +3151,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * if we did partial map, or found file backed vmas, * release any pages we did get */ - if (pret > 0) { - for (j = 0; j < pret; j++) - put_page(pages[j]); - } + if (pret > 0) + put_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -3078,9 +3238,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif - io_mem_free(ctx->sq_ring); + io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); - io_mem_free(ctx->cq_ring); percpu_ref_exit(&ctx->refs); if (ctx->account_mem) @@ -3101,10 +3260,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != - ctx->sq_ring->ring_entries) + if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != + ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3149,14 +3308,12 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset) { case IORING_OFF_SQ_RING: - ptr = ctx->sq_ring; + case IORING_OFF_CQ_RING: + ptr = ctx->rings; break; case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; - case IORING_OFF_CQ_RING: - ptr = 
ctx->cq_ring; - break; default: return -EINVAL; } @@ -3199,19 +3356,27 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * Just return the requested submit count, and wake the thread if * we were asked to. */ + ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sqo_wait); submitted = to_submit; - goto out_ctx; - } + } else if (to_submit) { + bool block_for_last = false; - ret = 0; - if (to_submit) { to_submit = min(to_submit, ctx->sq_entries); + /* + * Allow last submission to block in a series, IFF the caller + * asked to wait for events and we don't currently have + * enough. This potentially avoids an async punt. + */ + if (to_submit == min_complete && + io_cqring_events(ctx->rings) < min_complete) + block_for_last = true; + mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit); + submitted = io_ring_submit(ctx, to_submit, block_for_last); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3226,7 +3391,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } -out_ctx: io_ring_drop_ctx_refs(ctx, 1); out_fput: fdput(f); @@ -3243,19 +3407,27 @@ static const struct file_operations io_uring_fops = { static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t size; + struct io_rings *rings; + size_t size, sq_array_offset; - sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); - if (!sq_ring) + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + rings = io_mem_alloc(size); + if (!rings) return -ENOMEM; - ctx->sq_ring = sq_ring; - sq_ring->ring_mask = p->sq_entries - 1; - sq_ring->ring_entries = p->sq_entries; - ctx->sq_mask = sq_ring->ring_mask; - ctx->sq_entries = sq_ring->ring_entries; + ctx->rings = rings; + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + rings->sq_ring_mask = p->sq_entries - 1; + rings->cq_ring_mask = p->cq_entries - 1; + rings->sq_ring_entries = p->sq_entries; + rings->cq_ring_entries = p->cq_entries; + ctx->sq_mask = rings->sq_ring_mask; + ctx->cq_mask = rings->cq_ring_mask; + ctx->sq_entries = rings->sq_ring_entries; + ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) @@ -3265,15 +3437,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (!ctx->sq_sqes) return -ENOMEM; - cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) - return -ENOMEM; - - ctx->cq_ring = cq_ring; - cq_ring->ring_mask = p->cq_entries - 1; - cq_ring->ring_entries = p->cq_entries; - ctx->cq_mask = cq_ring->ring_mask; - ctx->cq_entries = cq_ring->ring_entries; return 0; } @@ -3377,21 +3540,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_sq_ring, r.head); - p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); - p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); - p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); - p->sq_off.flags = offsetof(struct io_sq_ring, flags); - p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); - p->sq_off.array = offsetof(struct io_sq_ring, array); + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = 
offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_cq_ring, r.head); - p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); - p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); - p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); - p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); - p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + + p->features = IORING_FEAT_SINGLE_MMAP; return ret; err: io_ring_ctx_wait_and_kill(ctx); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 85a9093769a9..35768a63fb1d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -10,7 +10,7 @@ * * The following files are helpful: * - * Documentation/filesystems/nfs/Exporting + * Documentation/filesystems/nfs/exporting.rst * fs/exportfs/expfs.c. */ diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 22a273bd4648..05cb0e8e4382 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -5,7 +5,7 @@ config JFS_FS select CRC32 help This is a port of IBM's Journaled Filesystem . More information is - available in the file <file:Documentation/filesystems/jfs.txt>. + available in the file <file:Documentation/admin-guide/jfs.rst>. If you do not intend to use the JFS filesystem, say N. 
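The io_uring hunks above merge the two previously separate ring allocations into a single struct io_rings and advertise this to userspace via the new IORING_FEAT_SINGLE_MMAP feature bit; sq_off.array correspondingly becomes a byte offset into the shared mapping rather than into a standalone SQ ring. Below is a minimal userspace sketch of what the feature bit buys an application; it assumes the <linux/io_uring.h> UAPI header, and the helper name setup_rings plus its error handling are illustrative, not part of the patch.

#define _GNU_SOURCE
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Illustrative sketch: create a ring and map it the post-patch way. */
static int setup_rings(unsigned entries, void **rings_out)
{
	struct io_uring_params p = { 0 };
	int fd = (int)syscall(__NR_io_uring_setup, entries, &p);

	if (fd < 0)
		return -1;

	/* Ring sizes follow from the offsets the kernel filled in. */
	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(uint32_t);
	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	size_t sz = sq_sz > cq_sz ? sq_sz : cq_sz;

	if (!(p.features & IORING_FEAT_SINGLE_MMAP)) {
		close(fd);	/* older kernel: would need two mmap() calls */
		return -1;
	}

	/*
	 * One mapping now covers both ring headers plus the SQ index array;
	 * sq_off.array is a byte offset into this same mapping.
	 */
	void *rings = mmap(NULL, sz, PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (rings == MAP_FAILED) {
		close(fd);
		return -1;
	}
	*rings_out = rings;
	return fd;
}

Kernels predating this change require separate mmap() calls at IORING_OFF_SQ_RING and IORING_OFF_CQ_RING, which is why the sketch tests the feature bit instead of assuming the new layout.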
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a387534c9577..6ebae6bbe6a5 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -137,6 +137,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, if (kn_from == kn_to) return strlcpy(buf, "/", buflen); + if (!buf) + return -EINVAL; + common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; @@ -144,8 +147,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); - if (buf) - buf[0] = '\0'; + buf[0] = '\0'; for (i = 0; i < depth_from; i++) len += strlcpy(buf + len, parent_str, @@ -430,7 +432,6 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) */ void kernfs_put_active(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) @@ -442,7 +443,7 @@ void kernfs_put_active(struct kernfs_node *kn) if (likely(v != KN_DEACTIVATED_BIAS)) return; - wake_up_all(&root->deactivate_waitq); + wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** diff --git a/fs/locks.c b/fs/locks.c index 686eae21daf6..a364ebc5cec3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1592,7 +1592,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) { WARN_ON_ONCE(1); - return error; + goto free_lock; } percpu_down_read(&file_rwsem); @@ -1672,6 +1672,7 @@ out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); +free_lock: locks_free_lock(new_fl); return error; } @@ -2784,10 +2785,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { - seq_printf(f, "%s ", - (lease_breaking(fl)) - ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ " - : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ "); + int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; + + seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : + (type == F_RDLCK) ? "READ" : "UNLCK"); } if (inode) { /* userspace relies on this representation of dev_t */ diff --git a/fs/namei.c b/fs/namei.c index 209c51a5226c..671c3c1a3425 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -596,14 +596,12 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + if (nd->flags & LOOKUP_ROOT_GRABBED) { path_put(&nd->root); - nd->root.mnt = NULL; + nd->flags &= ~LOOKUP_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; rcu_read_unlock(); } nd->depth = 0; @@ -641,6 +639,14 @@ static bool legitimize_links(struct nameidata *nd) return true; } +static bool legitimize_root(struct nameidata *nd) +{ + if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + return true; + nd->flags |= LOOKUP_ROOT_GRABBED; + return legitimize_path(nd, &nd->root, nd->root_seq); +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). 
In situations when we can't @@ -671,23 +677,18 @@ static int unlazy_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) - goto out2; - if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out1; - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) - goto out; - } + if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) + goto out; + if (unlikely(!legitimize_root(nd))) + goto out; rcu_read_unlock(); BUG_ON(nd->inode != parent->d_inode); return 0; -out2: +out1: nd->path.mnt = NULL; nd->path.dentry = NULL; -out1: - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; out: rcu_read_unlock(); return -ECHILD; @@ -727,23 +728,14 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) { - rcu_read_unlock(); - dput(dentry); - goto drop_root_mnt; - } + if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) { - rcu_read_unlock(); - dput(dentry); - return -ECHILD; - } - } - + if (unlikely(!legitimize_root(nd))) + goto out_dput; rcu_read_unlock(); return 0; @@ -753,9 +745,10 @@ out1: nd->path.dentry = NULL; out: rcu_read_unlock(); -drop_root_mnt: - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; + return -ECHILD; +out_dput: + rcu_read_unlock(); + dput(dentry); return -ECHILD; } @@ -819,6 +812,7 @@ static void set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); + nd->flags |= LOOKUP_ROOT_GRABBED; } } @@ -1735,8 +1729,6 @@ static int pick_link(struct nameidata *nd, struct path *link, nd->flags &= ~LOOKUP_RCU; nd->path.mnt = NULL; nd->path.dentry = NULL; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; rcu_read_unlock(); } else if (likely(unlazy_walk(nd)) == 0) error = nd_alloc_stack(nd); @@ -2350,7 +2342,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, path->dentry, flags & LOOKUP_PARENT); + audit_inode(name, path->dentry, 0); restore_nameidata(); putname(name); return retval; @@ -2391,7 +2383,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name, if (likely(!retval)) { *last = nd.last; *type = nd.last_type; - audit_inode(name, parent->dentry, LOOKUP_PARENT); + audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } else { putname(name); name = ERR_PTR(retval); @@ -2718,7 +2710,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, if (unlikely(error == -ESTALE)) error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) - audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL); + audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL); restore_nameidata(); putname(name); return error; @@ -3299,7 +3291,7 @@ static int do_last(struct nameidata *nd, if (error) return error; - audit_inode(nd->name, dir, LOOKUP_PARENT); + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? 
*/ if (unlikely(nd->last.name[nd->last.len])) return -EISDIR; diff --git a/fs/namespace.c b/fs/namespace.c index d28d30b13043..227f7b343034 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1643,13 +1643,18 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +#ifdef CONFIG_MANDATORY_FILE_LOCKING static inline bool may_mandlock(void) { -#ifndef CONFIG_MANDATORY_FILE_LOCKING - return false; -#endif return capable(CAP_SYS_ADMIN); } +#else +static inline bool may_mandlock(void) +{ + pr_warn("VFS: \"mand\" mount option not supported"); + return false; +} +#endif /* * Now umount can handle mount points as well as block devices. @@ -1675,8 +1680,6 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - lookup_flags |= LOOKUP_NO_EVAL; - retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; @@ -3046,7 +3049,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, return -EINVAL; /* ... and get the mountpoint */ - retval = user_path(dir_name, &path); + retval = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (retval) return retval; @@ -3593,11 +3596,13 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!may_mount()) return -EPERM; - error = user_path_dir(new_root, &new); + error = user_path_at(AT_FDCWD, new_root, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) goto out0; - error = user_path_dir(put_old, &old); + error = user_path_at(AT_FDCWD, put_old, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) goto out1; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c764cfe456e5..2a03bfeec10a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1403,11 +1403,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; - /* No fileid? Just exit */ - if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) - return 0; + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { + /* Only a mounted-on-fileid? Just exit */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + return 0; /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid) { + } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) @@ -1807,11 +1808,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - /* No fileid? Just exit */ - if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) - return 0; + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { + /* Only a mounted-on-fileid? Just exit */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + return 0; /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid) { + } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? 
*/ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 976d4089e267..361cc10d6f95 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -207,7 +207,6 @@ TRACE_DEFINE_ENUM(LOOKUP_PARENT); TRACE_DEFINE_ENUM(LOOKUP_REVAL); TRACE_DEFINE_ENUM(LOOKUP_RCU); TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL); -TRACE_DEFINE_ENUM(LOOKUP_NO_EVAL); TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); @@ -226,7 +225,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_REVAL, "REVAL" }, \ { LOOKUP_RCU, "RCU" }, \ { LOOKUP_NO_REVAL, "NO_REVAL" }, \ - { LOOKUP_NO_EVAL, "NO_EVAL" }, \ { LOOKUP_OPEN, "OPEN" }, \ { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3cf4f6aa48d6..2c215171c0eb 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1385,8 +1385,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) static int nfsd_fs_get_tree(struct fs_context *fc) { - fc->s_fs_info = get_net(fc->net_ns); - return vfs_get_super(fc, vfs_get_keyed_super, nfsd_fill_super); + return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns)); } static void nfsd_fs_free_fc(struct fs_context *fc) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 960f9a3c012d..a5612abc0936 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -555,7 +555,7 @@ static int orangefs_fsync(struct file *file, * Change the file pointer position for an instance of an open file. * * \note If .llseek is overridden, we must acquire lock as described in - * Documentation/filesystems/Locking. + * Documentation/filesystems/locking.rst. * * Future upgrade could support SEEK_DATA and SEEK_HOLE but would * require much changes to the FS diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 572dd29fbd54..34a6c99fa29b 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -246,7 +246,7 @@ struct orangefs_read_options { extern struct orangefs_stats orangefs_stats; /* - * NOTE: See Documentation/filesystems/porting for information + * NOTE: See Documentation/filesystems/porting.rst for information * on implementing FOO_I and properly accessing fs private data */ static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode) diff --git a/fs/proc/root.c b/fs/proc/root.c index 33f72d1b92cc..0b7c8dffc9ae 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -157,8 +157,7 @@ static int proc_get_tree(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; - fc->s_fs_info = ctx->pid_ns; - return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); + return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns); } static void proc_fs_context_free(struct fs_context *fc) diff --git a/fs/super.c b/fs/super.c index 5960578a4076..9459ba75a32e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -32,6 +32,7 @@ #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/cleancache.h> +#include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> @@ -290,6 +291,7 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); + fscrypt_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -1211,7 +1213,18 @@ int get_tree_single(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_single); +int 
get_tree_keyed(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc), + void *key) +{ + fc->s_fs_info = key; + return vfs_get_super(fc, vfs_get_keyed_super, fill_super); +} +EXPORT_SYMBOL(get_tree_keyed); + #ifdef CONFIG_BLOCK + static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; @@ -1221,6 +1234,99 @@ static int set_bdev_super(struct super_block *s, void *data) return 0; } +static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) +{ + return set_bdev_super(s, fc->sget_key); +} + +static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) +{ + return s->s_bdev == fc->sget_key; +} + +/** + * get_tree_bdev - Get a superblock based on a single block device + * @fc: The filesystem context holding the parameters + * @fill_super: Helper to initialise a new superblock + */ +int get_tree_bdev(struct fs_context *fc, + int (*fill_super)(struct super_block *, + struct fs_context *)) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error = 0; + + if (!(fc->sb_flags & SB_RDONLY)) + mode |= FMODE_WRITE; + + if (!fc->source) + return invalf(fc, "No source specified"); + + bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type); + if (IS_ERR(bdev)) { + errorf(fc, "%s: Can't open blockdev", fc->source); + return PTR_ERR(bdev); + } + + /* Once the superblock is inserted into the list by sget_fc(), s_umount + * will protect the lockfs code from trying to start a snapshot while + * we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + return -EBUSY; + } + + fc->sb_flags |= SB_NOSEC; + fc->sget_key = bdev; + s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) + return PTR_ERR(s); + + if (s->s_root) { + /* Don't summarily change the RO/RW state. */ + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { + warnf(fc, "%pg: Can't mount, would change RO state", bdev); + deactivate_locked_super(s); + blkdev_put(bdev, mode); + return -EBUSY; + } + + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
+ */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } else { + s->s_mode = mode; + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, fc); + if (error) { + deactivate_locked_super(s); + return error; + } + + s->s_flags |= SB_ACTIVE; + bdev->bd_super = s; + } + + BUG_ON(fc->root); + fc->root = dget(s->s_root); + return 0; +} +EXPORT_SYMBOL(get_tree_bdev); + static int test_bdev_super(struct super_block *s, void *data) { return (void *)s->s_bdev == data; diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a6fc8aa1de7..48305ba41e3c 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -471,7 +471,11 @@ static int do_timerfd_settime(int ufd, int flags, break; } spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + + if (isalarm(ctx)) + hrtimer_cancel_wait_running(&ctx->t.alarm.timer); + else + hrtimer_cancel_wait_running(&ctx->t.tmr); } /* diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 034ad14710d1..5dc5abca11c7 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -185,6 +185,21 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GET_ENCRYPTION_POLICY: return fscrypt_ioctl_get_policy(file, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + return fscrypt_ioctl_add_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return fscrypt_ioctl_remove_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return fscrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + default: return -ENOTTY; } @@ -202,6 +217,11 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8c1d571334bc..5e1e8ec0589e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -318,6 +318,16 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +static int ubifs_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + if (!drop) + drop = fscrypt_drop_inode(inode); + + return drop; +} + static void ubifs_evict_inode(struct inode *inode) { int err; @@ -1994,6 +2004,7 @@ const struct super_operations ubifs_super_operations = { .free_inode = ubifs_free_inode, .put_super = ubifs_put_super, .write_inode = ubifs_write_inode, + .drop_inode = ubifs_drop_inode, .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index fcb41516ea59..6d30adb6b890 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -9,7 +9,7 @@ config UFS_FS this file system as well. Saying Y here will allow you to read from these partitions; if you also want to write to them, say Y to the experimental "UFS file system write support", below. Please read the - file <file:Documentation/filesystems/ufs.txt> for more information. + file <file:Documentation/admin-guide/ufs.rst> for more information. 
The recently released UFS2 variant (used in FreeBSD 5.x) is READ-ONLY supported. diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig new file mode 100644 index 000000000000..88fb25119899 --- /dev/null +++ b/fs/verity/Kconfig @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 + +config FS_VERITY + bool "FS Verity (read-only file-based authenticity protection)" + select CRYPTO + # SHA-256 is selected as it's intended to be the default hash algorithm. + # To avoid bloat, other wanted algorithms must be selected explicitly. + select CRYPTO_SHA256 + help + This option enables fs-verity. fs-verity is the dm-verity + mechanism implemented at the file level. On supported + filesystems (currently EXT4 and F2FS), userspace can use an + ioctl to enable verity for a file, which causes the filesystem + to build a Merkle tree for the file. The filesystem will then + transparently verify any data read from the file against the + Merkle tree. The file is also made read-only. + + This serves as an integrity check, but the availability of the + Merkle tree root hash also allows efficiently supporting + various use cases where normally the whole file would need to + be hashed at once, such as: (a) auditing (logging the file's + hash), or (b) authenticity verification (comparing the hash + against a known good value, e.g. from a digital signature). + + fs-verity is especially useful on large files where not all + the contents may actually be needed. Also, fs-verity verifies + data each time it is paged back in, which provides better + protection against malicious disks vs. an ahead-of-time hash. + + If unsure, say N. + +config FS_VERITY_DEBUG + bool "FS Verity debugging" + depends on FS_VERITY + help + Enable debugging messages related to fs-verity by default. + + Say N unless you are an fs-verity developer. + +config FS_VERITY_BUILTIN_SIGNATURES + bool "FS Verity builtin signature support" + depends on FS_VERITY + select SYSTEM_DATA_VERIFICATION + help + Support verifying signatures of verity files against the X.509 + certificates that have been loaded into the ".fs-verity" + kernel keyring. + + This is meant as a relatively simple mechanism that can be + used to provide an authenticity guarantee for verity files, as + an alternative to IMA appraisal. Userspace programs still + need to check that the verity bit is set in order to get an + authenticity guarantee. + + If unsure, say N. 
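To make the help text above concrete: enabling verity is a single ioctl on a read-only fd. The following is a hedged userspace sketch, assuming a <linux/fsverity.h> UAPI header that provides FS_IOC_ENABLE_VERITY and struct fsverity_enable_arg; the helper name make_file_verity and the error handling are illustrative, not taken from this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fsverity.h>

/* Illustrative sketch: turn one file into a verity file. */
static int make_file_verity(const char *path)
{
	/* The fd must be read-only; the kernel locks out writers while hashing. */
	int fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return -1;

	struct fsverity_enable_arg arg;
	memset(&arg, 0, sizeof(arg));
	arg.version = 1;
	arg.hash_algorithm = FS_VERITY_HASH_ALG_SHA256;
	arg.block_size = 4096;	/* must equal PAGE_SIZE, per the ioctl's checks */

	if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) != 0) {
		perror("FS_IOC_ENABLE_VERITY");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

Once the ioctl succeeds, the file is permanently read-only on that filesystem, every page read from it is checked against the Merkle tree, and FS_IOC_MEASURE_VERITY returns the root-hash-based file measurement.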
diff --git a/fs/verity/Makefile b/fs/verity/Makefile new file mode 100644 index 000000000000..570e9136334d --- /dev/null +++ b/fs/verity/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_FS_VERITY) += enable.o \ + hash_algs.o \ + init.o \ + measure.o \ + open.o \ + verify.o + +obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o diff --git a/fs/verity/enable.c b/fs/verity/enable.c new file mode 100644 index 000000000000..eabc6ac19906 --- /dev/null +++ b/fs/verity/enable.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/enable.c: ioctl to enable verity on a file + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <crypto/hash.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <linux/uaccess.h> + +static int build_merkle_tree_level(struct inode *inode, unsigned int level, + u64 num_blocks_to_hash, + const struct merkle_tree_params *params, + u8 *pending_hashes, + struct ahash_request *req) +{ + const struct fsverity_operations *vops = inode->i_sb->s_vop; + unsigned int pending_size = 0; + u64 dst_block_num; + u64 i; + int err; + + if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */ + return -EINVAL; + + if (level < params->num_levels) { + dst_block_num = params->level_start[level]; + } else { + if (WARN_ON(num_blocks_to_hash != 1)) + return -EINVAL; + dst_block_num = 0; /* unused */ + } + + for (i = 0; i < num_blocks_to_hash; i++) { + struct page *src_page; + + if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash) + pr_debug("Hashing block %llu of %llu for level %u\n", + i + 1, num_blocks_to_hash, level); + + if (level == 0) { + /* Leaf: hashing a data block */ + src_page = read_mapping_page(inode->i_mapping, i, NULL); + if (IS_ERR(src_page)) { + err = PTR_ERR(src_page); + fsverity_err(inode, + "Error %d reading data page %llu", + err, i); + return err; + } + } else { + /* Non-leaf: hashing hash block from level below */ + src_page = vops->read_merkle_tree_page(inode, + params->level_start[level - 1] + i); + if (IS_ERR(src_page)) { + err = PTR_ERR(src_page); + fsverity_err(inode, + "Error %d reading Merkle tree page %llu", + err, params->level_start[level - 1] + i); + return err; + } + } + + err = fsverity_hash_page(params, inode, req, src_page, + &pending_hashes[pending_size]); + put_page(src_page); + if (err) + return err; + pending_size += params->digest_size; + + if (level == params->num_levels) /* Root hash? */ + return 0; + + if (pending_size + params->digest_size > params->block_size || + i + 1 == num_blocks_to_hash) { + /* Flush the pending hash block */ + memset(&pending_hashes[pending_size], 0, + params->block_size - pending_size); + err = vops->write_merkle_tree_block(inode, + pending_hashes, + dst_block_num, + params->log_blocksize); + if (err) { + fsverity_err(inode, + "Error %d writing Merkle tree block %llu", + err, dst_block_num); + return err; + } + dst_block_num++; + pending_size = 0; + } + + if (fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } + return 0; +} + +/* + * Build the Merkle tree for the given inode using the given parameters, and + * return the root hash in @root_hash. + * + * The tree is written to a filesystem-specific location as determined by the + * ->write_merkle_tree_block() method. However, the blocks that comprise the + * tree are the same for all filesystems. 
+ */ +static int build_merkle_tree(struct inode *inode, + const struct merkle_tree_params *params, + u8 *root_hash) +{ + u8 *pending_hashes; + struct ahash_request *req; + u64 blocks; + unsigned int level; + int err = -ENOMEM; + + if (inode->i_size == 0) { + /* Empty file is a special case; root hash is all 0's */ + memset(root_hash, 0, params->digest_size); + return 0; + } + + pending_hashes = kmalloc(params->block_size, GFP_KERNEL); + req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL); + if (!pending_hashes || !req) + goto out; + + /* + * Build each level of the Merkle tree, starting at the leaf level + * (level 0) and ascending to the root node (level 'num_levels - 1'). + * Then at the end (level 'num_levels'), calculate the root hash. + */ + blocks = (inode->i_size + params->block_size - 1) >> + params->log_blocksize; + for (level = 0; level <= params->num_levels; level++) { + err = build_merkle_tree_level(inode, level, blocks, params, + pending_hashes, req); + if (err) + goto out; + blocks = (blocks + params->hashes_per_block - 1) >> + params->log_arity; + } + memcpy(root_hash, pending_hashes, params->digest_size); + err = 0; +out: + kfree(pending_hashes); + ahash_request_free(req); + return err; +} + +static int enable_verity(struct file *filp, + const struct fsverity_enable_arg *arg) +{ + struct inode *inode = file_inode(filp); + const struct fsverity_operations *vops = inode->i_sb->s_vop; + struct merkle_tree_params params = { }; + struct fsverity_descriptor *desc; + size_t desc_size = sizeof(*desc) + arg->sig_size; + struct fsverity_info *vi; + int err; + + /* Start initializing the fsverity_descriptor */ + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) + return -ENOMEM; + desc->version = 1; + desc->hash_algorithm = arg->hash_algorithm; + desc->log_blocksize = ilog2(arg->block_size); + + /* Get the salt if the user provided one */ + if (arg->salt_size && + copy_from_user(desc->salt, + (const u8 __user *)(uintptr_t)arg->salt_ptr, + arg->salt_size)) { + err = -EFAULT; + goto out; + } + desc->salt_size = arg->salt_size; + + /* Get the signature if the user provided one */ + if (arg->sig_size && + copy_from_user(desc->signature, + (const u8 __user *)(uintptr_t)arg->sig_ptr, + arg->sig_size)) { + err = -EFAULT; + goto out; + } + desc->sig_size = cpu_to_le32(arg->sig_size); + + desc->data_size = cpu_to_le64(inode->i_size); + + /* Prepare the Merkle tree parameters */ + err = fsverity_init_merkle_tree_params(¶ms, inode, + arg->hash_algorithm, + desc->log_blocksize, + desc->salt, desc->salt_size); + if (err) + goto out; + + /* + * Start enabling verity on this file, serialized by the inode lock. + * Fail if verity is already enabled or is already being enabled. + */ + inode_lock(inode); + if (IS_VERITY(inode)) + err = -EEXIST; + else + err = vops->begin_enable_verity(filp); + inode_unlock(inode); + if (err) + goto out; + + /* + * Build the Merkle tree. Don't hold the inode lock during this, since + * on huge files this may take a very long time and we don't want to + * force unrelated syscalls like chown() to block forever. We don't + * need the inode lock here because deny_write_access() already prevents + * the file from being written to or truncated, and we still serialize + * ->begin_enable_verity() and ->end_enable_verity() using the inode + * lock and only allow one process to be here at a time on a given file. 
+ */ + pr_debug("Building Merkle tree...\n"); + BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE); + err = build_merkle_tree(inode, ¶ms, desc->root_hash); + if (err) { + fsverity_err(inode, "Error %d building Merkle tree", err); + goto rollback; + } + pr_debug("Done building Merkle tree. Root hash is %s:%*phN\n", + params.hash_alg->name, params.digest_size, desc->root_hash); + + /* + * Create the fsverity_info. Don't bother trying to save work by + * reusing the merkle_tree_params from above. Instead, just create the + * fsverity_info from the fsverity_descriptor as if it were just loaded + * from disk. This is simpler, and it serves as an extra check that the + * metadata we're writing is valid before actually enabling verity. + */ + vi = fsverity_create_info(inode, desc, desc_size); + if (IS_ERR(vi)) { + err = PTR_ERR(vi); + goto rollback; + } + + if (arg->sig_size) + pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n", + arg->sig_size); + + /* + * Tell the filesystem to finish enabling verity on the file. + * Serialized with ->begin_enable_verity() by the inode lock. + */ + inode_lock(inode); + err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size); + inode_unlock(inode); + if (err) { + fsverity_err(inode, "%ps() failed with err %d", + vops->end_enable_verity, err); + fsverity_free_info(vi); + } else if (WARN_ON(!IS_VERITY(inode))) { + err = -EINVAL; + fsverity_free_info(vi); + } else { + /* Successfully enabled verity */ + + /* + * Readers can start using ->i_verity_info immediately, so it + * can't be rolled back once set. So don't set it until just + * after the filesystem has successfully enabled verity. + */ + fsverity_set_info(inode, vi); + } +out: + kfree(params.hashstate); + kfree(desc); + return err; + +rollback: + inode_lock(inode); + (void)vops->end_enable_verity(filp, NULL, 0, params.tree_size); + inode_unlock(inode); + goto out; +} + +/** + * fsverity_ioctl_enable() - enable verity on a file + * + * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of + * Documentation/filesystems/fsverity.rst for the documentation. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + struct fsverity_enable_arg arg; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.version != 1) + return -EINVAL; + + if (arg.__reserved1 || + memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2))) + return -EINVAL; + + if (arg.block_size != PAGE_SIZE) + return -EINVAL; + + if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt)) + return -EMSGSIZE; + + if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) + return -EMSGSIZE; + + /* + * Require a regular file with write access. But the actual fd must + * still be readonly so that we can lock out all writers. This is + * needed to guarantee that no writable fds exist to the file once it + * has verity enabled, and to stabilize the data being hashed. 
+ */ + + err = inode_permission(inode, MAY_WRITE); + if (err) + return err; + + if (IS_APPEND(inode)) + return -EPERM; + + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) /* -EROFS */ + return err; + + err = deny_write_access(filp); + if (err) /* -ETXTBSY */ + goto out_drop_write; + + err = enable_verity(filp, &arg); + if (err) + goto out_allow_write_access; + + /* + * Some pages of the file may have been evicted from pagecache after + * being used in the Merkle tree construction, then read into pagecache + * again by another process reading from the file concurrently. Since + * these pages didn't undergo verification against the file measurement + * which fs-verity now claims to be enforcing, we have to wipe the + * pagecache to ensure that all future reads are verified. + */ + filemap_write_and_wait(inode->i_mapping); + invalidate_inode_pages2(inode->i_mapping); + + /* + * allow_write_access() is needed to pair with deny_write_access(). + * Regardless, the filesystem won't allow writing to verity files. + */ +out_allow_write_access: + allow_write_access(filp); +out_drop_write: + mnt_drop_write_file(filp); + return err; +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_enable); diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h new file mode 100644 index 000000000000..e74c79b64d88 --- /dev/null +++ b/fs/verity/fsverity_private.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs-verity: read-only file-based authenticity protection + * + * Copyright 2019 Google LLC + */ + +#ifndef _FSVERITY_PRIVATE_H +#define _FSVERITY_PRIVATE_H + +#ifdef CONFIG_FS_VERITY_DEBUG +#define DEBUG +#endif + +#define pr_fmt(fmt) "fs-verity: " fmt + +#include <crypto/sha.h> +#include <linux/fsverity.h> + +struct ahash_request; + +/* + * Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty; + * it's enough for over U64_MAX bytes of data using SHA-256 and 4K blocks. + */ +#define FS_VERITY_MAX_LEVELS 8 + +/* + * Largest digest size among all hash algorithms supported by fs-verity. + * Currently assumed to be <= size of fsverity_descriptor::root_hash. + */ +#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE + +/* A hash algorithm supported by fs-verity */ +struct fsverity_hash_alg { + struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ + const char *name; /* crypto API name, e.g. sha256 */ + unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ + unsigned int block_size; /* block size in bytes, e.g. 
64 for SHA-256 */ +}; + +/* Merkle tree parameters: hash algorithm, initial hash state, and topology */ +struct merkle_tree_params { + const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ + const u8 *hashstate; /* initial hash state or NULL */ + unsigned int digest_size; /* same as hash_alg->digest_size */ + unsigned int block_size; /* size of data and tree blocks */ + unsigned int hashes_per_block; /* number of hashes per tree block */ + unsigned int log_blocksize; /* log2(block_size) */ + unsigned int log_arity; /* log2(hashes_per_block) */ + unsigned int num_levels; /* number of levels in Merkle tree */ + u64 tree_size; /* Merkle tree size in bytes */ + + /* + * Starting block index for each tree level, ordered from leaf level (0) + * to root level ('num_levels - 1') + */ + u64 level_start[FS_VERITY_MAX_LEVELS]; +}; + +/** + * fsverity_info - cached verity metadata for an inode + * + * When a verity file is first opened, an instance of this struct is allocated + * and stored in ->i_verity_info; it remains until the inode is evicted. It + * caches information about the Merkle tree that's needed to efficiently verify + * data read from the file. It also caches the file measurement. The Merkle + * tree pages themselves are not cached here, but the filesystem may cache them. + */ +struct fsverity_info { + struct merkle_tree_params tree_params; + u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; + u8 measurement[FS_VERITY_MAX_DIGEST_SIZE]; + const struct inode *inode; +}; + +/* + * Merkle tree properties. The file measurement is the hash of this structure + * excluding the signature and with the sig_size field set to 0. + */ +struct fsverity_descriptor { + __u8 version; /* must be 1 */ + __u8 hash_algorithm; /* Merkle tree hash algorithm */ + __u8 log_blocksize; /* log2 of size of data and tree blocks */ + __u8 salt_size; /* size of salt in bytes; 0 if none */ + __le32 sig_size; /* size of signature in bytes; 0 if none */ + __le64 data_size; /* size of file the Merkle tree is built over */ + __u8 root_hash[64]; /* Merkle tree root hash */ + __u8 salt[32]; /* salt prepended to each hashed block */ + __u8 __reserved[144]; /* must be 0's */ + __u8 signature[]; /* optional PKCS#7 signature */ +}; + +/* Arbitrary limit to bound the kmalloc() size. Can be changed. */ +#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 + +#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ + sizeof(struct fsverity_descriptor)) + +/* + * Format in which verity file measurements are signed. This is the same as + * 'struct fsverity_digest', except here some magic bytes are prepended to + * provide some context about what is being signed in case the same key is used + * for non-fsverity purposes, and here the fields have fixed endianness. 
+ */ +struct fsverity_signed_digest { + char magic[8]; /* must be "FSVerity" */ + __le16 digest_algorithm; + __le16 digest_size; + __u8 digest[]; +}; + +/* hash_algs.c */ + +extern struct fsverity_hash_alg fsverity_hash_algs[]; + +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num); +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, + const u8 *salt, size_t salt_size); +int fsverity_hash_page(const struct merkle_tree_params *params, + const struct inode *inode, + struct ahash_request *req, struct page *page, u8 *out); +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, + const void *data, size_t size, u8 *out); +void __init fsverity_check_hash_algs(void); + +/* init.c */ + +extern void __printf(3, 4) __cold +fsverity_msg(const struct inode *inode, const char *level, + const char *fmt, ...); + +#define fsverity_warn(inode, fmt, ...) \ + fsverity_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) +#define fsverity_err(inode, fmt, ...) \ + fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) + +/* open.c */ + +int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, + const struct inode *inode, + unsigned int hash_algorithm, + unsigned int log_blocksize, + const u8 *salt, size_t salt_size); + +struct fsverity_info *fsverity_create_info(const struct inode *inode, + void *desc, size_t desc_size); + +void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); + +void fsverity_free_info(struct fsverity_info *vi); + +int __init fsverity_init_info_cache(void); +void __init fsverity_exit_info_cache(void); + +/* signature.c */ + +#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES +int fsverity_verify_signature(const struct fsverity_info *vi, + const struct fsverity_descriptor *desc, + size_t desc_size); + +int __init fsverity_init_signature(void); +#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ +static inline int +fsverity_verify_signature(const struct fsverity_info *vi, + const struct fsverity_descriptor *desc, + size_t desc_size) +{ + return 0; +} + +static inline int fsverity_init_signature(void) +{ + return 0; +} +#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ + +/* verify.c */ + +int __init fsverity_init_workqueue(void); +void __init fsverity_exit_workqueue(void); + +#endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c new file mode 100644 index 000000000000..31e6d7d2389a --- /dev/null +++ b/fs/verity/hash_algs.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/hash_algs.c: fs-verity hash algorithms + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <crypto/hash.h> +#include <linux/scatterlist.h> + +/* The hash algorithms supported by fs-verity */ +struct fsverity_hash_alg fsverity_hash_algs[] = { + [FS_VERITY_HASH_ALG_SHA256] = { + .name = "sha256", + .digest_size = SHA256_DIGEST_SIZE, + .block_size = SHA256_BLOCK_SIZE, + }, + [FS_VERITY_HASH_ALG_SHA512] = { + .name = "sha512", + .digest_size = SHA512_DIGEST_SIZE, + .block_size = SHA512_BLOCK_SIZE, + }, +}; + +/** + * fsverity_get_hash_alg() - validate and prepare a hash algorithm + * @inode: optional inode for logging purposes + * @num: the hash algorithm number + * + * Get the struct fsverity_hash_alg for the given hash algorithm number, and + * ensure it has a hash transform ready to go. 
The hash transforms are + * allocated on-demand so that we don't waste resources unnecessarily, and + * because the crypto modules may be initialized later than fs/verity/. + * + * Return: pointer to the hash alg on success, else an ERR_PTR() + */ +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num) +{ + struct fsverity_hash_alg *alg; + struct crypto_ahash *tfm; + int err; + + if (num >= ARRAY_SIZE(fsverity_hash_algs) || + !fsverity_hash_algs[num].name) { + fsverity_warn(inode, "Unknown hash algorithm number: %u", num); + return ERR_PTR(-EINVAL); + } + alg = &fsverity_hash_algs[num]; + + /* pairs with cmpxchg() below */ + tfm = READ_ONCE(alg->tfm); + if (likely(tfm != NULL)) + return alg; + /* + * Using the shash API would make things a bit simpler, but the ahash + * API is preferable as it allows the use of crypto accelerators. + */ + tfm = crypto_alloc_ahash(alg->name, 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + fsverity_warn(inode, + "Missing crypto API support for hash algorithm \"%s\"", + alg->name); + return ERR_PTR(-ENOPKG); + } + fsverity_err(inode, + "Error allocating hash algorithm \"%s\": %ld", + alg->name, PTR_ERR(tfm)); + return ERR_CAST(tfm); + } + + err = -EINVAL; + if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm))) + goto err_free_tfm; + if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm))) + goto err_free_tfm; + + pr_info("%s using implementation \"%s\"\n", + alg->name, crypto_ahash_driver_name(tfm)); + + /* pairs with READ_ONCE() above */ + if (cmpxchg(&alg->tfm, NULL, tfm) != NULL) + crypto_free_ahash(tfm); + + return alg; + +err_free_tfm: + crypto_free_ahash(tfm); + return ERR_PTR(err); +} + +/** + * fsverity_prepare_hash_state() - precompute the initial hash state + * @alg: hash algorithm + * @salt: a salt which is to be prepended to all data to be hashed + * @salt_size: salt size in bytes, possibly 0 + * + * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed + * initial hash state on success or an ERR_PTR() on failure. + */ +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, + const u8 *salt, size_t salt_size) +{ + u8 *hashstate = NULL; + struct ahash_request *req = NULL; + u8 *padded_salt = NULL; + size_t padded_salt_size; + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + int err; + + if (salt_size == 0) + return NULL; + + hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL); + if (!hashstate) + return ERR_PTR(-ENOMEM); + + req = ahash_request_alloc(alg->tfm, GFP_KERNEL); + if (!req) { + err = -ENOMEM; + goto err_free; + } + + /* + * Zero-pad the salt to the next multiple of the input size of the hash + * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128 + * bytes for SHA-512. This ensures that the hash algorithm won't have + * any bytes buffered internally after processing the salt, thus making + * salted hashing just as fast as unsalted hashing. 
+ */ + padded_salt_size = round_up(salt_size, alg->block_size); + padded_salt = kzalloc(padded_salt_size, GFP_KERNEL); + if (!padded_salt) { + err = -ENOMEM; + goto err_free; + } + memcpy(padded_salt, salt, salt_size); + + sg_init_one(&sg, padded_salt, padded_salt_size); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ahash_request_set_crypt(req, &sg, NULL, padded_salt_size); + + err = crypto_wait_req(crypto_ahash_init(req), &wait); + if (err) + goto err_free; + + err = crypto_wait_req(crypto_ahash_update(req), &wait); + if (err) + goto err_free; + + err = crypto_ahash_export(req, hashstate); + if (err) + goto err_free; +out: + ahash_request_free(req); + kfree(padded_salt); + return hashstate; + +err_free: + kfree(hashstate); + hashstate = ERR_PTR(err); + goto out; +} + +/** + * fsverity_hash_page() - hash a single data or hash page + * @params: the Merkle tree's parameters + * @inode: inode for which the hashing is being done + * @req: preallocated hash request + * @page: the page to hash + * @out: output digest, size 'params->digest_size' bytes + * + * Hash a single data or hash block, assuming block_size == PAGE_SIZE. + * The hash is salted if a salt is specified in the Merkle tree parameters. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_hash_page(const struct merkle_tree_params *params, + const struct inode *inode, + struct ahash_request *req, struct page *page, u8 *out) +{ + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + int err; + + if (WARN_ON(params->block_size != PAGE_SIZE)) + return -EINVAL; + + sg_init_table(&sg, 1); + sg_set_page(&sg, page, PAGE_SIZE, 0); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ahash_request_set_crypt(req, &sg, out, PAGE_SIZE); + + if (params->hashstate) { + err = crypto_ahash_import(req, params->hashstate); + if (err) { + fsverity_err(inode, + "Error %d importing hash state", err); + return err; + } + err = crypto_ahash_finup(req); + } else { + err = crypto_ahash_digest(req); + } + + err = crypto_wait_req(err, &wait); + if (err) + fsverity_err(inode, "Error %d computing page hash", err); + return err; +} + +/** + * fsverity_hash_buffer() - hash some data + * @alg: the hash algorithm to use + * @data: the data to hash + * @size: size of data to hash, in bytes + * @out: output digest, size 'alg->digest_size' bytes + * + * Hash some data which is located in physically contiguous memory (i.e. memory + * allocated by kmalloc(), not by vmalloc()). No salt is used. 
+ * + * Return: 0 on success, -errno on failure + */ +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, + const void *data, size_t size, u8 *out) +{ + struct ahash_request *req; + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + int err; + + req = ahash_request_alloc(alg->tfm, GFP_KERNEL); + if (!req) + return -ENOMEM; + + sg_init_one(&sg, data, size); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ahash_request_set_crypt(req, &sg, out, size); + + err = crypto_wait_req(crypto_ahash_digest(req), &wait); + + ahash_request_free(req); + return err; +} + +void __init fsverity_check_hash_algs(void) +{ + size_t i; + + /* + * Sanity check the hash algorithms (could be a build-time check, but + * they're in an array) + */ + for (i = 0; i < ARRAY_SIZE(fsverity_hash_algs); i++) { + const struct fsverity_hash_alg *alg = &fsverity_hash_algs[i]; + + if (!alg->name) + continue; + + BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE); + + /* + * For efficiency, the implementation currently assumes the + * digest and block sizes are powers of 2. This limitation can + * be lifted if the code is updated to handle other values. + */ + BUG_ON(!is_power_of_2(alg->digest_size)); + BUG_ON(!is_power_of_2(alg->block_size)); + } +} diff --git a/fs/verity/init.c b/fs/verity/init.c new file mode 100644 index 000000000000..94c104e00861 --- /dev/null +++ b/fs/verity/init.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/init.c: fs-verity module initialization and logging + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/ratelimit.h> + +void fsverity_msg(const struct inode *inode, const char *level, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (inode) + printk("%sfs-verity (%s, inode %lu): %pV\n", + level, inode->i_sb->s_id, inode->i_ino, &vaf); + else + printk("%sfs-verity: %pV\n", level, &vaf); + va_end(args); +} + +static int __init fsverity_init(void) +{ + int err; + + fsverity_check_hash_algs(); + + err = fsverity_init_info_cache(); + if (err) + return err; + + err = fsverity_init_workqueue(); + if (err) + goto err_exit_info_cache; + + err = fsverity_init_signature(); + if (err) + goto err_exit_workqueue; + + pr_debug("Initialized fs-verity\n"); + return 0; + +err_exit_workqueue: + fsverity_exit_workqueue(); +err_exit_info_cache: + fsverity_exit_info_cache(); + return err; +} +late_initcall(fsverity_init) diff --git a/fs/verity/measure.c b/fs/verity/measure.c new file mode 100644 index 000000000000..05049b68c745 --- /dev/null +++ b/fs/verity/measure.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/measure.c: ioctl to get a verity file's measurement + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/uaccess.h> + +/** + * fsverity_ioctl_measure() - get a verity file's measurement + * + * Retrieve the file measurement that the kernel is enforcing for reads from a + * verity file. See the "FS_IOC_MEASURE_VERITY" section of + * Documentation/filesystems/fsverity.rst for the documentation. 
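+ *
+ * A minimal sketch of the userspace side, for illustration only (the 32-byte
+ * buffer assumes the file was set up with SHA-256):
+ *
+ *	struct {
+ *		struct fsverity_digest d;
+ *		__u8 digest[32];
+ *	} buf = { .d = { .digest_size = 32 } };
+ *
+ *	if (ioctl(fd, FS_IOC_MEASURE_VERITY, &buf.d) == 0)
+ *		printf("alg %u, %u-byte digest\n",
+ *		       buf.d.digest_algorithm, buf.d.digest_size);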
+ * + * Return: 0 on success, -errno on failure + */ +int fsverity_ioctl_measure(struct file *filp, void __user *_uarg) +{ + const struct inode *inode = file_inode(filp); + struct fsverity_digest __user *uarg = _uarg; + const struct fsverity_info *vi; + const struct fsverity_hash_alg *hash_alg; + struct fsverity_digest arg; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + hash_alg = vi->tree_params.hash_alg; + + /* + * The user specifies the digest_size their buffer has space for; we can + * return the digest if it fits in the available space. We write back + * the actual size, which may be shorter than the user-specified size. + */ + + if (get_user(arg.digest_size, &uarg->digest_size)) + return -EFAULT; + if (arg.digest_size < hash_alg->digest_size) + return -EOVERFLOW; + + memset(&arg, 0, sizeof(arg)); + arg.digest_algorithm = hash_alg - fsverity_hash_algs; + arg.digest_size = hash_alg->digest_size; + + if (copy_to_user(uarg, &arg, sizeof(arg))) + return -EFAULT; + + if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); diff --git a/fs/verity/open.c b/fs/verity/open.c new file mode 100644 index 000000000000..63d1004b688c --- /dev/null +++ b/fs/verity/open.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/open.c: opening fs-verity files + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/slab.h> + +static struct kmem_cache *fsverity_info_cachep; + +/** + * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters + * @params: the parameters struct to initialize + * @inode: the inode for which the Merkle tree is being built + * @hash_algorithm: number of hash algorithm to use + * @log_blocksize: log base 2 of block size to use + * @salt: pointer to salt (optional) + * @salt_size: size of salt, possibly 0 + * + * Validate the hash algorithm and block size, then compute the tree topology + * (num levels, num blocks in each level, etc.) and initialize @params. 
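+ *
+ * For example, with SHA-256 and 4096-byte blocks, each hash block holds
+ * 4096 / 32 = 128 hashes, so a 1 GiB file (262144 data blocks) gets a
+ * 3-level tree: 2048 hash blocks at level 0, 16 at level 1, and 1 root.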
+ * + * Return: 0 on success, -errno on failure + */ +int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, + const struct inode *inode, + unsigned int hash_algorithm, + unsigned int log_blocksize, + const u8 *salt, size_t salt_size) +{ + const struct fsverity_hash_alg *hash_alg; + int err; + u64 blocks; + u64 offset; + int level; + + memset(params, 0, sizeof(*params)); + + hash_alg = fsverity_get_hash_alg(inode, hash_algorithm); + if (IS_ERR(hash_alg)) + return PTR_ERR(hash_alg); + params->hash_alg = hash_alg; + params->digest_size = hash_alg->digest_size; + + params->hashstate = fsverity_prepare_hash_state(hash_alg, salt, + salt_size); + if (IS_ERR(params->hashstate)) { + err = PTR_ERR(params->hashstate); + params->hashstate = NULL; + fsverity_err(inode, "Error %d preparing hash state", err); + goto out_err; + } + + if (log_blocksize != PAGE_SHIFT) { + fsverity_warn(inode, "Unsupported log_blocksize: %u", + log_blocksize); + err = -EINVAL; + goto out_err; + } + params->log_blocksize = log_blocksize; + params->block_size = 1 << log_blocksize; + + if (WARN_ON(!is_power_of_2(params->digest_size))) { + err = -EINVAL; + goto out_err; + } + if (params->block_size < 2 * params->digest_size) { + fsverity_warn(inode, + "Merkle tree block size (%u) too small for hash algorithm \"%s\"", + params->block_size, hash_alg->name); + err = -EINVAL; + goto out_err; + } + params->log_arity = params->log_blocksize - ilog2(params->digest_size); + params->hashes_per_block = 1 << params->log_arity; + + pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n", + hash_alg->name, params->block_size, params->hashes_per_block, + (int)salt_size, salt); + + /* + * Compute the number of levels in the Merkle tree and create a map from + * level to the starting block of that level. Level 'num_levels - 1' is + * the root and is stored first. Level 0 is the level directly "above" + * the data blocks and is stored last. + */ + + /* Compute number of levels and the number of blocks in each level */ + blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); + while (blocks > 1) { + if (params->num_levels >= FS_VERITY_MAX_LEVELS) { + fsverity_err(inode, "Too many levels in Merkle tree"); + err = -EINVAL; + goto out_err; + } + blocks = (blocks + params->hashes_per_block - 1) >> + params->log_arity; + /* temporarily using level_start[] to store blocks in level */ + params->level_start[params->num_levels++] = blocks; + } + + /* Compute the starting block of each level */ + offset = 0; + for (level = (int)params->num_levels - 1; level >= 0; level--) { + blocks = params->level_start[level]; + params->level_start[level] = offset; + pr_debug("Level %d is %llu blocks starting at index %llu\n", + level, blocks, offset); + offset += blocks; + } + + params->tree_size = offset << log_blocksize; + return 0; + +out_err: + kfree(params->hashstate); + memset(params, 0, sizeof(*params)); + return err; +} + +/* + * Compute the file measurement by hashing the fsverity_descriptor excluding the + * signature and with the sig_size field set to 0. 
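+ * Zeroing sig_size while hashing keeps the measurement independent of
+ * whether a signature is attached, so signed and unsigned copies of the
+ * same file have the same measurement.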
+ */ +static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg, + struct fsverity_descriptor *desc, + u8 *measurement) +{ + __le32 sig_size = desc->sig_size; + int err; + + desc->sig_size = 0; + err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement); + desc->sig_size = sig_size; + + return err; +} + +/* + * Validate the given fsverity_descriptor and create a new fsverity_info from + * it. The signature (if present) is also checked. + */ +struct fsverity_info *fsverity_create_info(const struct inode *inode, + void *_desc, size_t desc_size) +{ + struct fsverity_descriptor *desc = _desc; + struct fsverity_info *vi; + int err; + + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return ERR_PTR(-EINVAL); + } + + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return ERR_PTR(-EINVAL); + } + + if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return ERR_PTR(-EINVAL); + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return ERR_PTR(-EINVAL); + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return ERR_PTR(-EINVAL); + } + + vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); + if (!vi) + return ERR_PTR(-ENOMEM); + vi->inode = inode; + + err = fsverity_init_merkle_tree_params(&vi->tree_params, inode, + desc->hash_algorithm, + desc->log_blocksize, + desc->salt, desc->salt_size); + if (err) { + fsverity_err(inode, + "Error %d initializing Merkle tree parameters", + err); + goto out; + } + + memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); + + err = compute_file_measurement(vi->tree_params.hash_alg, desc, + vi->measurement); + if (err) { + fsverity_err(inode, "Error %d computing file measurement", err); + goto out; + } + pr_debug("Computed file measurement: %s:%*phN\n", + vi->tree_params.hash_alg->name, + vi->tree_params.digest_size, vi->measurement); + + err = fsverity_verify_signature(vi, desc, desc_size); +out: + if (err) { + fsverity_free_info(vi); + vi = ERR_PTR(err); + } + return vi; +} + +void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) +{ + /* + * Multiple processes may race to set ->i_verity_info, so use cmpxchg. + * This pairs with the READ_ONCE() in fsverity_get_info(). 
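+	 * If another task beat us to it, free our copy and keep the winner's;
+	 * both were built from the same descriptor, so they're equivalent.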
+ */ + if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL) + fsverity_free_info(vi); +} + +void fsverity_free_info(struct fsverity_info *vi) +{ + if (!vi) + return; + kfree(vi->tree_params.hashstate); + kmem_cache_free(fsverity_info_cachep, vi); +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + int res; + + if (vi) + return 0; + + res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); + if (res < 0) { + fsverity_err(inode, + "Error %d getting verity descriptor size", res); + return res; + } + if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + fsverity_err(inode, "Verity descriptor is too large (%d bytes)", + res); + return -EMSGSIZE; + } + desc = kmalloc(res, GFP_KERNEL); + if (!desc) + return -ENOMEM; + res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); + if (res < 0) { + fsverity_err(inode, "Error %d reading verity descriptor", res); + goto out_free_desc; + } + + vi = fsverity_create_info(inode, desc, res); + if (IS_ERR(vi)) { + res = PTR_ERR(vi); + goto out_free_desc; + } + + fsverity_set_info(inode, vi); + res = 0; +out_free_desc: + kfree(desc); + return res; +} + +/** + * fsverity_file_open() - prepare to open a verity file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * When opening a verity file, deny the open if it is for writing. Otherwise, + * set up the inode's ->i_verity_info if not already done. + * + * When combined with fscrypt, this must be called after fscrypt_file_open(). + * Otherwise, we won't have the key set up to decrypt the verity metadata. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_file_open(struct inode *inode, struct file *filp) +{ + if (!IS_VERITY(inode)) + return 0; + + if (filp->f_mode & FMODE_WRITE) { + pr_debug("Denying opening verity file (ino %lu) for write\n", + inode->i_ino); + return -EPERM; + } + + return ensure_verity_info(inode); +} +EXPORT_SYMBOL_GPL(fsverity_file_open); + +/** + * fsverity_prepare_setattr() - prepare to change a verity inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Verity files are immutable, so deny truncates. This isn't covered by the + * open-time check because sys_truncate() takes a path, not a file descriptor. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) { + pr_debug("Denying truncate of verity file (ino %lu)\n", + d_inode(dentry)->i_ino); + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); + +/** + * fsverity_cleanup_inode() - free the inode's verity info, if present + * + * Filesystems must call this on inode eviction to free ->i_verity_info. 
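+ *
+ * This is safe to call even for inodes that never had verity enabled,
+ * since fsverity_free_info() tolerates a NULL pointer.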
+ */ +void fsverity_cleanup_inode(struct inode *inode) +{ + fsverity_free_info(inode->i_verity_info); + inode->i_verity_info = NULL; +} +EXPORT_SYMBOL_GPL(fsverity_cleanup_inode); + +int __init fsverity_init_info_cache(void) +{ + fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, + SLAB_RECLAIM_ACCOUNT, + measurement); + if (!fsverity_info_cachep) + return -ENOMEM; + return 0; +} + +void __init fsverity_exit_info_cache(void) +{ + kmem_cache_destroy(fsverity_info_cachep); + fsverity_info_cachep = NULL; +} diff --git a/fs/verity/signature.c b/fs/verity/signature.c new file mode 100644 index 000000000000..c8b255232de5 --- /dev/null +++ b/fs/verity/signature.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/signature.c: verification of builtin signatures + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/cred.h> +#include <linux/key.h> +#include <linux/slab.h> +#include <linux/verification.h> + +/* + * /proc/sys/fs/verity/require_signatures + * If 1, all verity files must have a valid builtin signature. + */ +static int fsverity_require_signatures; + +/* + * Keyring that contains the trusted X.509 certificates. + * + * Only root (kuid=0) can modify this. Also, root may use + * keyctl_restrict_keyring() to prevent any more additions. + */ +static struct key *fsverity_keyring; + +/** + * fsverity_verify_signature() - check a verity file's signature + * + * If the file's fs-verity descriptor includes a signature of the file + * measurement, verify it against the certificates in the fs-verity keyring. + * + * Return: 0 on success (signature valid or not required); -errno on failure + */ +int fsverity_verify_signature(const struct fsverity_info *vi, + const struct fsverity_descriptor *desc, + size_t desc_size) +{ + const struct inode *inode = vi->inode; + const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; + const u32 sig_size = le32_to_cpu(desc->sig_size); + struct fsverity_signed_digest *d; + int err; + + if (sig_size == 0) { + if (fsverity_require_signatures) { + fsverity_err(inode, + "require_signatures=1, rejecting unsigned file!"); + return -EPERM; + } + return 0; + } + + if (sig_size > desc_size - sizeof(*desc)) { + fsverity_err(inode, "Signature overflows verity descriptor"); + return -EBADMSG; + } + + d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); + if (!d) + return -ENOMEM; + memcpy(d->magic, "FSVerity", 8); + d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs); + d->digest_size = cpu_to_le16(hash_alg->digest_size); + memcpy(d->digest, vi->measurement, hash_alg->digest_size); + + err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, + desc->signature, sig_size, + fsverity_keyring, + VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL); + kfree(d); + + if (err) { + if (err == -ENOKEY) + fsverity_err(inode, + "File's signing cert isn't in the fs-verity keyring"); + else if (err == -EKEYREJECTED) + fsverity_err(inode, "Incorrect file signature"); + else if (err == -EBADMSG) + fsverity_err(inode, "Malformed file signature"); + else + fsverity_err(inode, "Error %d verifying file signature", + err); + return err; + } + + pr_debug("Valid signature for file measurement %s:%*phN\n", + hash_alg->name, hash_alg->digest_size, vi->measurement); + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fsverity_sysctl_header; + +static const struct ctl_path fsverity_sysctl_path[] = { + { .procname = "fs", }, + { .procname = "verity", }, + { } +}; + +static struct 
ctl_table fsverity_sysctl_table[] = { + { + .procname = "require_signatures", + .data = &fsverity_require_signatures, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init fsverity_sysctl_init(void) +{ + fsverity_sysctl_header = register_sysctl_paths(fsverity_sysctl_path, + fsverity_sysctl_table); + if (!fsverity_sysctl_header) { + pr_err("sysctl registration failed!\n"); + return -ENOMEM; + } + return 0; +} +#else /* !CONFIG_SYSCTL */ +static inline int __init fsverity_sysctl_init(void) +{ + return 0; +} +#endif /* !CONFIG_SYSCTL */ + +int __init fsverity_init_signature(void) +{ + struct key *ring; + int err; + + ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), KEY_POS_SEARCH | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | + KEY_USR_SEARCH | KEY_USR_SETATTR, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + err = fsverity_sysctl_init(); + if (err) + goto err_put_ring; + + fsverity_keyring = ring; + return 0; + +err_put_ring: + key_put(ring); + return err; +} diff --git a/fs/verity/verify.c b/fs/verity/verify.c new file mode 100644 index 000000000000..3e8f2de44667 --- /dev/null +++ b/fs/verity/verify.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages() + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <crypto/hash.h> +#include <linux/bio.h> +#include <linux/ratelimit.h> + +static struct workqueue_struct *fsverity_read_workqueue; + +/** + * hash_at_level() - compute the location of the block's hash at the given level + * + * @params: (in) the Merkle tree parameters + * @dindex: (in) the index of the data block being verified + * @level: (in) the level of hash we want (0 is leaf level) + * @hindex: (out) the index of the hash block containing the wanted hash + * @hoffset: (out) the byte offset to the wanted hash within the hash block + */ +static void hash_at_level(const struct merkle_tree_params *params, + pgoff_t dindex, unsigned int level, pgoff_t *hindex, + unsigned int *hoffset) +{ + pgoff_t position; + + /* Offset of the hash within the level's region, in hashes */ + position = dindex >> (level * params->log_arity); + + /* Index of the hash block in the tree overall */ + *hindex = params->level_start[level] + (position >> params->log_arity); + + /* Offset of the wanted hash (in bytes) within the hash block */ + *hoffset = (position & ((1 << params->log_arity) - 1)) << + (params->log_blocksize - params->log_arity); +} + +/* Extract a hash from a hash page */ +static void extract_hash(struct page *hpage, unsigned int hoffset, + unsigned int hsize, u8 *out) +{ + void *virt = kmap_atomic(hpage); + + memcpy(out, virt + hoffset, hsize); + kunmap_atomic(virt); +} + +static inline int cmp_hashes(const struct fsverity_info *vi, + const u8 *want_hash, const u8 *real_hash, + pgoff_t index, int level) +{ + const unsigned int hsize = vi->tree_params.digest_size; + + if (memcmp(want_hash, real_hash, hsize) == 0) + return 0; + + fsverity_err(vi->inode, + "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + index, level, + vi->tree_params.hash_alg->name, hsize, want_hash, + vi->tree_params.hash_alg->name, hsize, real_hash); + return -EBADMSG; +} + +/* + * Verify a single data page against the file's Merkle tree. 
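+ *
+ * As an example of the index arithmetic in hash_at_level() above: with
+ * 4096-byte blocks and SHA-256 hashes, log_arity = 7, so the level-0 hash
+ * for data block 1000000 is in hash block level_start[0] + (1000000 >> 7) =
+ * level_start[0] + 7812, at byte offset (1000000 & 127) * 32 = 2048.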
+ * + * In principle, we need to verify the entire path to the root node. However, + * for efficiency the filesystem may cache the hash pages. Therefore we need + * only ascend the tree until an already-verified page is seen, as indicated by + * the PageChecked bit being set; then verify the path to that page. + * + * This code currently only supports the case where the verity block size is + * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we + * wouldn't be able to use the PageChecked bit. + * + * Note that multiple processes may race to verify a hash page and mark it + * Checked, but it doesn't matter; the result will be the same either way. + * + * Return: true if the page is valid, else false. + */ +static bool verify_page(struct inode *inode, const struct fsverity_info *vi, + struct ahash_request *req, struct page *data_page) +{ + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int hsize = params->digest_size; + const pgoff_t index = data_page->index; + int level; + u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; + const u8 *want_hash; + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; + struct page *hpages[FS_VERITY_MAX_LEVELS]; + unsigned int hoffsets[FS_VERITY_MAX_LEVELS]; + int err; + + if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page))) + return false; + + pr_debug_ratelimited("Verifying data page %lu...\n", index); + + /* + * Starting at the leaf level, ascend the tree saving hash pages along + * the way until we find a verified hash page, indicated by PageChecked; + * or until we reach the root. + */ + for (level = 0; level < params->num_levels; level++) { + pgoff_t hindex; + unsigned int hoffset; + struct page *hpage; + + hash_at_level(params, index, level, &hindex, &hoffset); + + pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", + level, hindex, hoffset); + + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, + hindex); + if (IS_ERR(hpage)) { + err = PTR_ERR(hpage); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, hindex); + goto out; + } + + if (PageChecked(hpage)) { + extract_hash(hpage, hoffset, hsize, _want_hash); + want_hash = _want_hash; + put_page(hpage); + pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", + params->hash_alg->name, + hsize, want_hash); + goto descend; + } + pr_debug_ratelimited("Hash page not yet checked\n"); + hpages[level] = hpage; + hoffsets[level] = hoffset; + } + + want_hash = vi->root_hash; + pr_debug("Want root hash: %s:%*phN\n", + params->hash_alg->name, hsize, want_hash); +descend: + /* Descend the tree verifying hash pages */ + for (; level > 0; level--) { + struct page *hpage = hpages[level - 1]; + unsigned int hoffset = hoffsets[level - 1]; + + err = fsverity_hash_page(params, inode, req, hpage, real_hash); + if (err) + goto out; + err = cmp_hashes(vi, want_hash, real_hash, index, level - 1); + if (err) + goto out; + SetPageChecked(hpage); + extract_hash(hpage, hoffset, hsize, _want_hash); + want_hash = _want_hash; + put_page(hpage); + pr_debug("Verified hash page at level %d, now want %s:%*phN\n", + level - 1, params->hash_alg->name, hsize, want_hash); + } + + /* Finally, verify the data page */ + err = fsverity_hash_page(params, inode, req, data_page, real_hash); + if (err) + goto out; + err = cmp_hashes(vi, want_hash, real_hash, index, -1); +out: + for (; level > 0; level--) + put_page(hpages[level - 1]); + + return err == 0; +} + +/** + * fsverity_verify_page() - verify a data page + * + * Verify a page that has just been 
read from a verity file. The page must be a + * pagecache page that is still locked and not yet uptodate. + * + * Return: true if the page is valid, else false. + */ +bool fsverity_verify_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + const struct fsverity_info *vi = inode->i_verity_info; + struct ahash_request *req; + bool valid; + + req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); + if (unlikely(!req)) + return false; + + valid = verify_page(inode, vi, req, page); + + ahash_request_free(req); + + return valid; +} +EXPORT_SYMBOL_GPL(fsverity_verify_page); + +#ifdef CONFIG_BLOCK +/** + * fsverity_verify_bio() - verify a 'read' bio that has just completed + * + * Verify a set of pages that have just been read from a verity file. The pages + * must be pagecache pages that are still locked and not yet uptodate. Pages + * that fail verification are set to the Error state. Verification is skipped + * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * + * This is a helper function for use by the ->readpages() method of filesystems + * that issue bios to read data directly into the page cache. Filesystems that + * populate the page cache without issuing bios (e.g. non block-based + * filesystems) must instead call fsverity_verify_page() directly on each page. + * All filesystems must also call fsverity_verify_page() on holes. + */ +void fsverity_verify_bio(struct bio *bio) +{ + struct inode *inode = bio_first_page_all(bio)->mapping->host; + const struct fsverity_info *vi = inode->i_verity_info; + struct ahash_request *req; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); + if (unlikely(!req)) { + bio_for_each_segment_all(bv, bio, iter_all) + SetPageError(bv->bv_page); + return; + } + + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!PageError(page) && !verify_page(inode, vi, req, page)) + SetPageError(page); + } + + ahash_request_free(req); +} +EXPORT_SYMBOL_GPL(fsverity_verify_bio); +#endif /* CONFIG_BLOCK */ + +/** + * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * + * Enqueue verification work for asynchronous processing. + */ +void fsverity_enqueue_verify_work(struct work_struct *work) +{ + queue_work(fsverity_read_workqueue, work); +} +EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work); + +int __init fsverity_init_workqueue(void) +{ + /* + * Use an unbound workqueue to allow bios to be verified in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since hashing is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize verification work, + * which blocks reads from completing, over regular application tasks. 
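+	 *
+	 * The max_active of num_online_cpus() below roughly caps the number
+	 * of verification work items executing at once at the CPU count,
+	 * which seems a reasonable limit for purely CPU-bound work.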
+	 */
+	fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
+						  WQ_UNBOUND | WQ_HIGHPRI,
+						  num_online_cpus());
+	if (!fsverity_read_workqueue)
+		return -ENOMEM;
+	return 0;
+}
+
+void __init fsverity_exit_workqueue(void)
+{
+	destroy_workqueue(fsverity_read_workqueue);
+	fsverity_read_workqueue = NULL;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f7848cd5527..affa557c2337 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -67,7 +67,7 @@ xfs_find_handle( return -EBADF; inode = file_inode(f.file); } else { - error = user_lpath((const char __user *)hreq->path, &path); + error = user_path_at(AT_FDCWD, hreq->path, 0, &path); if (error) return error; inode = d_inode(path.dentry);
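
As a usage sketch (an editor's illustration, not part of this diff): a bio-based filesystem is expected to drive the hooks above from its read path, roughly as follows. All example_* names are hypothetical; only fsverity_verify_bio() and fsverity_enqueue_verify_work() come from this patchset, and real callers add error handling that is elided here.

/*
 * Hypothetical glue showing how a filesystem's bio completion path could
 * hand off to fs-verity.  Hashing may sleep, so it must not run in the
 * bio's (interrupt-context) ->bi_end_io; defer it to the fs-verity
 * workqueue instead.
 */
#include <linux/bio.h>
#include <linux/fsverity.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct example_read_work {
	struct work_struct work;
	struct bio *bio;
};

static void example_verity_work(struct work_struct *work)
{
	struct example_read_work *rw =
		container_of(work, struct example_read_work, work);
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	/* Verify every data page in the bio; failures set PG_error. */
	fsverity_verify_bio(rw->bio);

	/* Then complete the read: uptodate only if verification passed. */
	bio_for_each_segment_all(bv, rw->bio, iter_all) {
		struct page *page = bv->bv_page;

		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	bio_put(rw->bio);
	kfree(rw);
}

/* ->bi_end_io for reads from verity files (alloc failure handling elided) */
static void example_read_end_io(struct bio *bio)
{
	struct example_read_work *rw = kmalloc(sizeof(*rw), GFP_ATOMIC);

	if (!rw)
		return;

	INIT_WORK(&rw->work, example_verity_work);
	rw->bio = bio;
	fsverity_enqueue_verify_work(&rw->work);
}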