diff options
Diffstat (limited to 'fs')
312 files changed, 22291 insertions, 8592 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 08112fbcaece..ca243e658d71 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -69,8 +69,12 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; sb->s_xattr = v9fs_xattr_handlers; - } else + } else { sb->s_op = &v9fs_super_ops; + sb->s_time_max = U32_MAX; + } + + sb->s_time_min = 0; ret = super_setup_bdi(sb); if (ret) diff --git a/fs/Kconfig b/fs/Kconfig index bfb1c6095c7a..2501e6f1f965 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -112,6 +112,8 @@ config MANDATORY_FILE_LOCKING source "fs/crypto/Kconfig" +source "fs/verity/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" @@ -261,6 +263,7 @@ source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" +source "fs/erofs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index d60089fd689b..14231b4cf383 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_IO_URING) += io_uring.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ +obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o @@ -130,3 +131,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +obj-$(CONFIG_EROFS_FS) += erofs/ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 14a6c1b90c9f..f708c45d5f66 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -375,7 +375,7 @@ affs_secs_to_datestamp(time64_t secs, struct affs_date *ds) u32 minute; s32 rem; - secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60); + secs -= sys_tz.tz_minuteswest * 60 + AFFS_EPOCH_DELTA; if (secs < 0) secs = 0; days = div_s64_rem(secs, 86400, &rem); diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h index f9bef9056659..81fb396d4dfa 100644 --- a/fs/affs/amigaffs.h +++ b/fs/affs/amigaffs.h @@ -32,6 +32,9 @@ #define AFFS_ROOT_BMAPS 25 +/* Seconds since Amiga epoch of 1978/01/01 to UNIX */ +#define AFFS_EPOCH_DELTA ((8 * 365 + 2) * 86400LL) + struct affs_date { __be32 days; __be32 mins; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 73598bff8506..a346cf7659f1 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -150,10 +150,10 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) } inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec - = (be32_to_cpu(tail->change.days) * (24 * 60 * 60) + + = (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + - ((8 * 365 + 2) * 24 * 60 * 60)) + + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60; inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; affs_brelse(bh); diff --git a/fs/affs/super.c b/fs/affs/super.c index e7d036efbaa1..cc463ae47c12 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -355,6 +355,10 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &affs_sops; sb->s_flags |= SB_NODIRATIME; + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_min = sys_tz.tz_minuteswest * 60 + AFFS_EPOCH_DELTA; + sb->s_time_max = 86400LL * U32_MAX + 86400 + sb->s_time_min; + sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 139b4e3cc946..cc12772d0a4d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -966,6 +966,58 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, } /* + * Check the validity of a dentry under RCU conditions. + */ +static int afs_d_revalidate_rcu(struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct dentry *parent; + struct inode *dir, *inode; + long dir_version, de_version; + + _enter("%p", dentry); + + /* Check the parent directory is still valid first. */ + parent = READ_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + dvnode = AFS_FS_I(dir); + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) + return -ECHILD; + + if (!afs_check_validity(dvnode)) + return -ECHILD; + + /* We only need to invalidate a dentry if the server's copy changed + * behind our back. If we made the change, it's no problem. Note that + * on a 32-bit system, we only have 32 bits in the dentry to store the + * version. + */ + dir_version = (long)READ_ONCE(dvnode->status.data_version); + de_version = (long)READ_ONCE(dentry->d_fsdata); + if (de_version != dir_version) { + dir_version = (long)READ_ONCE(dvnode->invalid_before); + if (de_version - dir_version < 0) + return -ECHILD; + } + + /* Check to see if the vnode referred to by the dentry still + * has a callback. + */ + if (d_really_is_positive(dentry)) { + inode = d_inode_rcu(dentry); + if (inode) { + vnode = AFS_FS_I(inode); + if (!afs_check_validity(vnode)) + return -ECHILD; + } + } + + return 1; /* Still valid */ +} + +/* * check that a dentry lookup hit has found a valid entry * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode @@ -982,7 +1034,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) int ret; if (flags & LOOKUP_RCU) - return -ECHILD; + return afs_d_revalidate_rcu(dentry); if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 114f281f3687..6f84231f11a5 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -15,8 +15,6 @@ #include "xdr_fs.h" #include "protocol_yfs.h" -static const struct afs_fid afs_zero_fid; - static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) { call->cbi = afs_get_cb_interest(cbi); @@ -394,7 +392,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) goto no_more_data; /* Discard any excess data the server gave us */ - iov_iter_discard(&call->iter, READ, req->actual_len - req->len); + afs_extract_discard(call, req->actual_len - req->len); call->unmarshall = 3; /* Fall through */ @@ -1872,7 +1870,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) call->count = count; call->count2 = count; - iov_iter_discard(&call->iter, READ, count * sizeof(__be32)); + afs_extract_discard(call, count * sizeof(__be32)); call->unmarshall++; /* Fall through */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f66a3be12fd6..9cdfabaeaa0b 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1217,6 +1217,7 @@ extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, struct afs_status_cb *); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); +extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); diff --git a/fs/afs/security.c b/fs/afs/security.c index 71e71c07568f..ce9de1e6742b 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -27,8 +27,37 @@ struct key *afs_request_key(struct afs_cell *cell) _enter("{%x}", key_serial(cell->anonymous_key)); _debug("key %s", cell->anonymous_key->description); - key = request_key(&key_type_rxrpc, cell->anonymous_key->description, - NULL); + key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description, + cell->net->net, NULL); + if (IS_ERR(key)) { + if (PTR_ERR(key) != -ENOKEY) { + _leave(" = %ld", PTR_ERR(key)); + return key; + } + + /* act as anonymous user */ + _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); + return key_get(cell->anonymous_key); + } else { + /* act as authorised user */ + _leave(" = {%x} [auth]", key_serial(key)); + return key; + } +} + +/* + * Get a key when pathwalk is in rcuwalk mode. + */ +struct key *afs_request_key_rcu(struct afs_cell *cell) +{ + struct key *key; + + _enter("{%x}", key_serial(cell->anonymous_key)); + + _debug("key %s", cell->anonymous_key->description); + key = request_key_net_rcu(&key_type_rxrpc, + cell->anonymous_key->description, + cell->net->net); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { _leave(" = %ld", PTR_ERR(key)); @@ -274,6 +303,40 @@ someone_else_changed_it: return; } +static bool afs_check_permit_rcu(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) +{ + const struct afs_permits *permits; + int i; + + _enter("{%llx:%llu},%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key)); + + /* check the permits to see if we've got one yet */ + if (key == vnode->volume->cell->anonymous_key) { + *_access = vnode->status.anon_access; + _leave(" = t [anon %x]", *_access); + return true; + } + + permits = rcu_dereference(vnode->permit_cache); + if (permits) { + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) + break; + + *_access = permits->permits[i].access; + _leave(" = %u [perm %x]", !permits->invalidated, *_access); + return !permits->invalidated; + } + } + + _leave(" = f"); + return false; +} + /* * check with the fileserver to see if the directory or parent directory is * permitted to be accessed with this authorisation, and if so, what access it @@ -340,33 +403,42 @@ int afs_permission(struct inode *inode, int mask) struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); struct key *key; - int ret; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; + int ret = 0; _enter("{{%llx:%llu},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return PTR_ERR(key); - } + if (mask & MAY_NOT_BLOCK) { + key = afs_request_key_rcu(vnode->volume->cell); + if (IS_ERR(key)) + return -ECHILD; + + ret = -ECHILD; + if (!afs_check_validity(vnode) || + !afs_check_permit_rcu(vnode, key, &access)) + goto error; + } else { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } - ret = afs_validate(vnode, key); - if (ret < 0) - goto error; + ret = afs_validate(vnode, key); + if (ret < 0) + goto error; - /* check the permits to see if we've got one yet */ - ret = afs_check_permit(vnode, key, &access); - if (ret < 0) - goto error; + /* check the permits to see if we've got one yet */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + goto error; + } /* interpret the access mask */ _debug("REQ %x ACC %x on %s", mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); + ret = 0; if (S_ISDIR(inode->i_mode)) { if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) { if (!(access & AFS_ACE_LOOKUP)) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 1a414300b654..92ca5e27573b 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -12,8 +12,6 @@ unsigned __read_mostly afs_volume_gc_delay = 10; unsigned __read_mostly afs_volume_record_life = 60 * 60; -static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; - /* * Allocate a volume record and load it up from a vldb record. */ diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index ca2452806ebf..3ee7abf4b2d0 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -505,7 +505,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) goto no_more_data; /* Discard any excess data the server gave us */ - iov_iter_discard(&call->iter, READ, req->actual_len - req->len); + afs_extract_discard(call, req->actual_len - req->len); call->unmarshall = 3; /* Fall through */ @@ -2007,7 +2007,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) acl->size = call->count2; afs_extract_begin(call, acl->data, size); } else { - iov_iter_discard(&call->iter, READ, size); + afs_extract_discard(call, size); } call->unmarshall++; /* Fall through */ @@ -2039,7 +2039,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) acl->size = call->count2; afs_extract_begin(call, acl->data, size); } else { - iov_iter_discard(&call->iter, READ, size); + afs_extract_discard(call, size); } call->unmarshall++; /* Fall through */ diff --git a/fs/attr.c b/fs/attr.c index d22e8187477f..df28035aa23e 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -183,15 +183,18 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (ia_valid & ATTR_ATIME) - inode->i_atime = timespec64_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec64_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - inode->i_ctime = timespec64_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + if (ia_valid & ATTR_ATIME) { + inode->i_atime = timestamp_truncate(attr->ia_atime, + inode); + } + if (ia_valid & ATTR_MTIME) { + inode->i_mtime = timestamp_truncate(attr->ia_mtime, + inode); + } + if (ia_valid & ATTR_CTIME) { + inode->i_ctime = timestamp_truncate(attr->ia_ctime, + inode); + } if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 8c0c11181fad..8bcec8dcabb6 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -58,7 +58,6 @@ struct autofs_info { struct completion expire_complete; struct list_head active; - int active_count; struct list_head expiring; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index cdff0567aacb..2866fabf497f 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -70,6 +70,27 @@ done: return status; } +/* p->d_lock held */ +static struct dentry *positive_after(struct dentry *p, struct dentry *child) +{ + if (child) + child = list_next_entry(child, d_child); + else + child = list_first_entry(&p->d_subdirs, struct dentry, d_child); + + list_for_each_entry_from(child, &p->d_subdirs, d_child) { + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) { + dget_dlock(child); + spin_unlock(&child->d_lock); + return child; + } + spin_unlock(&child->d_lock); + } + + return NULL; +} + /* * Calculate and dget next entry in the subdirs list under root. */ @@ -77,43 +98,14 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); - struct list_head *next; struct dentry *q; spin_lock(&sbi->lookup_lock); spin_lock(&root->d_lock); - - if (prev) - next = prev->d_child.next; - else { - prev = dget_dlock(root); - next = prev->d_subdirs.next; - } - -cont: - if (next == &root->d_subdirs) { - spin_unlock(&root->d_lock); - spin_unlock(&sbi->lookup_lock); - dput(prev); - return NULL; - } - - q = list_entry(next, struct dentry, d_child); - - spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); - /* Already gone or negative dentry (under construction) - try next */ - if (!d_count(q) || !simple_positive(q)) { - spin_unlock(&q->d_lock); - next = q->d_child.next; - goto cont; - } - dget_dlock(q); - spin_unlock(&q->d_lock); + q = positive_after(root, prev); spin_unlock(&root->d_lock); spin_unlock(&sbi->lookup_lock); - dput(prev); - return q; } @@ -124,59 +116,28 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); - struct list_head *next; - struct dentry *p, *ret; + struct dentry *p = prev, *ret = NULL, *d = NULL; if (prev == NULL) return dget(root); spin_lock(&sbi->lookup_lock); -relock: - p = prev; spin_lock(&p->d_lock); -again: - next = p->d_subdirs.next; - if (next == &p->d_subdirs) { - while (1) { - struct dentry *parent; - - if (p == root) { - spin_unlock(&p->d_lock); - spin_unlock(&sbi->lookup_lock); - dput(prev); - return NULL; - } + while (1) { + struct dentry *parent; - parent = p->d_parent; - if (!spin_trylock(&parent->d_lock)) { - spin_unlock(&p->d_lock); - cpu_relax(); - goto relock; - } - spin_unlock(&p->d_lock); - next = p->d_child.next; - p = parent; - if (next != &parent->d_subdirs) - break; - } - } - ret = list_entry(next, struct dentry, d_child); - - spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); - /* Negative dentry - try next */ - if (!simple_positive(ret)) { + ret = positive_after(p, d); + if (ret || p == root) + break; + parent = p->d_parent; spin_unlock(&p->d_lock); - lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); - p = ret; - goto again; + spin_lock(&parent->d_lock); + d = p; + p = parent; } - dget_dlock(ret); - spin_unlock(&ret->d_lock); spin_unlock(&p->d_lock); spin_unlock(&sbi->lookup_lock); - dput(prev); - return ret; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index e646569c75ed..29abafc0ce31 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -60,38 +60,15 @@ const struct dentry_operations autofs_dentry_operations = { .d_release = autofs_dentry_release, }; -static void autofs_add_active(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino; - - ino = autofs_dentry_ino(dentry); - if (ino) { - spin_lock(&sbi->lookup_lock); - if (!ino->active_count) { - if (list_empty(&ino->active)) - list_add(&ino->active, &sbi->active_list); - } - ino->active_count++; - spin_unlock(&sbi->lookup_lock); - } -} - static void autofs_del_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; ino = autofs_dentry_ino(dentry); - if (ino) { - spin_lock(&sbi->lookup_lock); - ino->active_count--; - if (!ino->active_count) { - if (!list_empty(&ino->active)) - list_del_init(&ino->active); - } - spin_unlock(&sbi->lookup_lock); - } + spin_lock(&sbi->lookup_lock); + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); } static int autofs_dir_open(struct inode *inode, struct file *file) @@ -527,19 +504,22 @@ static struct dentry *autofs_lookup(struct inode *dir, if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) return ERR_PTR(-ENOENT); - /* Mark entries in the root as mount triggers */ - if (IS_ROOT(dentry->d_parent) && - autofs_type_indirect(sbi->type)) - __managed_dentry_set_managed(dentry); - ino = autofs_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); + /* Mark entries in the root as mount triggers */ + if (IS_ROOT(dentry->d_parent) && + autofs_type_indirect(sbi->type)) + __managed_dentry_set_managed(dentry); dentry->d_fsdata = ino; ino->dentry = dentry; - autofs_add_active(dentry); + list_add(&ino->active, &sbi->active_list); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dentry->d_lock); } return NULL; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 462d096ff3e9..64cdf4d8e424 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -893,6 +893,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) sb_set_blocksize(sb, (ulong) befs_sb->block_size); sb->s_op = &befs_sops; sb->s_export_op = &befs_export_operations; + sb->s_time_min = 0; + sb->s_time_max = 0xffffffffffffll; root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); if (IS_ERR(root)) { ret = PTR_ERR(root); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 5e97bed073d7..f8ce1368218b 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -324,6 +324,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) return -ENOMEM; mutex_init(&info->bfs_lock); s->s_fs_info = info; + s->s_time_min = 0; + s->s_time_max = U32_MAX; sb_set_blocksize(s, BFS_BSIZE); diff --git a/fs/block_dev.c b/fs/block_dev.c index 677cb364d33f..9c073dbdc1b0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1972,6 +1972,9 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; + if (IS_SWAPFILE(bd_inode)) + return -ETXTBSY; + if (!iov_iter_count(from)) return 0; diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 76a843198bcb..82200dbca5ac 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o + block-rsv.o delalloc-space.o block-group.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 122cb97c7909..2e9e13ffbd08 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -12,9 +12,11 @@ #include "async-thread.h" #include "ctree.h" -#define WORK_DONE_BIT 0 -#define WORK_ORDER_DONE_BIT 1 -#define WORK_HIGH_PRIO_BIT 2 +enum { + WORK_DONE_BIT, + WORK_ORDER_DONE_BIT, + WORK_HIGH_PRIO_BIT, +}; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c new file mode 100644 index 000000000000..bf7e3f23bba7 --- /dev/null +++ b/fs/btrfs/block-group.c @@ -0,0 +1,3173 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "block-group.h" +#include "space-info.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "free-space-tree.h" +#include "disk-io.h" +#include "volumes.h" +#include "transaction.h" +#include "ref-verify.h" +#include "sysfs.h" +#include "tree-log.h" +#include "delalloc-space.h" + +/* + * Return target flags in extended format or 0 if restripe for this chunk_type + * is not in progress + * + * Should be called with balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Return reduced profile in chunk format. If profile changing is in progress + * (either running or paused) picks the target profile (if it's already + * available), otherwise falls back to plain reducing. + */ +static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 num_devices = fs_info->fs_devices->rw_devices; + u64 target; + u64 raid_type; + u64 allowed = 0; + + /* + * See if restripe for this chunk_type is in progress, if so try to + * reduce to the target profile + */ + spin_lock(&fs_info->balance_lock); + target = get_restripe_target(fs_info, flags); + if (target) { + /* Pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { + spin_unlock(&fs_info->balance_lock); + return extended_to_chunk(target); + } + } + spin_unlock(&fs_info->balance_lock); + + /* First, mask out the RAID levels which aren't possible */ + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_array[raid_type].bg_flag; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); +} + +static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +{ + unsigned seq; + u64 flags; + + do { + flags = orig_flags; + seq = read_seqbegin(&fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + return btrfs_reduce_alloc_profile(fs_info, flags); +} + +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +{ + return get_alloc_profile(fs_info, orig_flags); +} + +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. + */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); + kfree(cache->free_space_ctl); + kfree(cache); + } +} + +/* + * This adds the block group to the fs_info rb tree for the block group cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache *block_group_cache_tree_search( + struct btrfs_fs_info *info, u64 bytenr, int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) { + btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * Return the block group that starts at or after bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 0); +} + +/* + * Return the block group that contains the given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 1); +} + +struct btrfs_block_group_cache *btrfs_next_block_group( + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct rb_node *node; + + spin_lock(&fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; + } + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&fs_info->block_group_cache_lock); + return cache; +} + +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* No put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_var(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_var(&bg->reservations); + btrfs_put_block_group(bg); +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); +} + +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + refcount_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +{ + if (refcount_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * When we wait for progress in the block group caching, its because our + * allocation attempt failed at least once. So, we must sleep and let some + * progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to show + * up, and then it will check the block group free space numbers for our min + * num_bytes. Another option is to have it go ahead and look in the rbtree for + * a free extent of a given size, but this is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. + */ +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + + caching_ctl = btrfs_get_caching_control(cache); + if (!caching_ctl) + return; + + wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) || + (cache->free_space_ctl->free_space >= num_bytes)); + + btrfs_put_caching_control(caching_ctl); +} + +int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + int ret = 0; + + caching_ctl = btrfs_get_caching_control(cache); + if (!caching_ctl) + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; + + wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache)); + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; + btrfs_put_caching_control(caching_ctl); + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + fs_info->nodesize : fs_info->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + +/* + * This is only called by btrfs_cache_block_group, since we could have freed + * extents we need to check the pinned_extents for any extents that can't be + * used yet since their free space will be released as soon as the transaction + * commits. + */ +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + u64 start, u64 end) +{ + struct btrfs_fs_info *info = block_group->fs_info; + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); /* -ENOMEM or logic error */ + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); /* -ENOMEM or logic error */ + } + + return total_added; +} + +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group_cache *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret; + bool wakeup = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(block_group)) + wakeup = false; +#endif + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + +next: + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (btrfs_fs_closing(fs_info) > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = btrfs_find_next_key(extent_root, path, &key, 0, 0); + if (ret) + break; + + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + if (wakeup) + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; + } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + if (wakeup) + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + total_found += add_new_free_space(block_group, last, + key.objectid); + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->nodesize; + else + last = key.objectid + key.offset; + + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + if (wakeup) + wake_up(&caching_ctl->wait); + } + } + path->slots[0]++; + } + ret = 0; + + total_found += add_new_free_space(block_group, last, + block_group->key.objectid + + block_group->key.offset); + caching_ctl->progress = (u64)-1; + +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(block_group); + } +#endif + + caching_ctl->progress = (u64)-1; + + up_read(&fs_info->commit_root_sem); + btrfs_free_excluded_extents(block_group); + mutex_unlock(&caching_ctl->mutex); + + wake_up(&caching_ctl->wait); + + btrfs_put_caching_control(caching_ctl); + btrfs_put_block_group(block_group); +} + +int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, + int load_cache_only) +{ + DEFINE_WAIT(wait); + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; + int ret = 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + if (!caching_ctl) + return -ENOMEM; + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + refcount_set(&caching_ctl->count, 1); + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + refcount_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + btrfs_put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); + return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); + + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { + mutex_lock(&caching_ctl->mutex); + ret = load_free_space_cache(cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; + } else { + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; + } + } + spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(cache); + } +#endif + mutex_unlock(&caching_ctl->mutex); + + wake_up(&caching_ctl->wait); + if (ret == 1) { + btrfs_put_caching_control(caching_ctl); + btrfs_free_excluded_extents(cache); + return 0; + } + } else { + /* + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); + } + + if (load_cache_only) { + btrfs_put_caching_control(caching_ctl); + return 0; + } + + down_write(&fs_info->commit_root_sem); + refcount_inc(&caching_ctl->count); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->commit_root_sem); + + btrfs_get_block_group(cache); + + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); + + return ret; +} + +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +/* + * Clear incompat bits for the following feature(s): + * + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group + * in the whole filesystem + */ +static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *sinfo; + + list_for_each_entry_rcu(sinfo, head, list) { + bool found = false; + + down_read(&sinfo->groups_sem); + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) + found = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) + found = true; + up_read(&sinfo->groups_sem); + + if (found) + return; + } + btrfs_clear_fs_incompat(fs_info, RAID56); + } +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_key key; + struct inode *inode; + struct kobject *kobj = NULL; + int ret; + int index; + int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; + bool remove_rsv = false; + + block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + trace_btrfs_remove_block_group(block_group); + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + btrfs_free_excluded_extents(block_group); + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, + block_group->key.offset); + + memcpy(&key, &block_group->key, sizeof(key)); + index = btrfs_bg_flags_to_raid_index(block_group->flags); + factor = btrfs_bg_type_to_factor(block_group->flags); + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ + inode = lookup_free_space_inode(block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * Make sure our free space cache IO is done before removing the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(trans, block_group, path); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + remove_rsv = true; + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + + if (!IS_ERR(inode)) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + btrfs_add_delayed_iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + + spin_lock(&fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + + if (fs_info->first_logical_byte == block_group->key.objectid) + fs_info->first_logical_byte = (u64)-1; + spin_unlock(&fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; + clear_avail_alloc_bits(fs_info, block_group->flags); + } + up_write(&block_group->space_info->groups_sem); + clear_incompat_bg_bits(fs_info, block_group->flags); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + + if (block_group->has_caching_ctl) + caching_ctl = btrfs_get_caching_control(block_group); + if (block_group->cached == BTRFS_CACHE_STARTED) + btrfs_wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + refcount_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + btrfs_put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); + } + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + WARN_ON(!list_empty(&block_group->dirty_list)); + WARN_ON(!list_empty(&block_group->io_list)); + spin_unlock(&trans->transaction->dirty_bgs_lock); + + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; + + spin_unlock(&block_group->space_info->lock); + + memcpy(&key, &block_group->key, sizeof(key)); + + mutex_lock(&fs_info->chunk_mutex); + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + spin_unlock(&block_group->lock); + + mutex_unlock(&fs_info->chunk_mutex); + + ret = remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &fs_info->mapping_tree; + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } +out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_free_path(path); + return ret; +} + +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, const u64 chunk_offset) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + unsigned int num_items; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + ASSERT(em && em->start == chunk_offset); + + /* + * We need to reserve 3 + N units from the metadata space info in order + * to remove a block group (done at btrfs_remove_chunk() and at + * btrfs_remove_block_group()), which are used for: + * + * 1 unit for adding the free space inode's orphan (located in the tree + * of tree roots). + * 1 unit for deleting the block group item (located in the extent + * tree). + * 1 unit for deleting the free space item (located in tree of tree + * roots). + * N units for deleting N device extent items corresponding to each + * stripe (located in the device tree). + * + * In order to remove a block group we also need to reserve units in the + * system space info in order to update the chunk tree (update one or + * more device items and remove one chunk item), but this is done at + * btrfs_remove_chunk() through a call to check_system_chunk(). + */ + map = em->map_lookup; + num_items = 3 + map->num_stripes; + free_extent_map(em); + + return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, + num_items, 1); +} + +/* + * Mark block group @cache read-only, so later write won't happen to block + * group @cache. + * + * If @force is not set, this function will only mark the block group readonly + * if we have enough free space (1M) in other metadata/system block groups. + * If @force is not set, this function will mark the block group readonly + * without checking free space. + * + * NOTE: This function doesn't care if other block groups can contain all the + * data in this block group. That check should be done by relocation routine, + * not this function. + */ +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + u64 sinfo_used; + u64 min_allocable_bytes; + int ret = -ENOSPC; + + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = SZ_1M; + else + min_allocable_bytes = 0; + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + + if (cache->ro) { + cache->ro++; + ret = 0; + goto out; + } + + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * sinfo_used + num_bytes should always <= sinfo->total_bytes. + * + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer (if min_allocable_bytes is not 0). + */ + if (sinfo_used + num_bytes + min_allocable_bytes <= + sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + cache->ro++; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); + ret = 0; + } +out: + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { + btrfs_info(cache->fs_info, + "unable to make block group %llu ro", + cache->key.objectid); + btrfs_info(cache->fs_info, + "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", + sinfo_used, num_bytes, min_allocable_bytes); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + } + return ret; +} + +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + int trimming; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + + space_info = block_group->space_info; + + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || block_group->pinned || + btrfs_block_group_used(&block_group->item) || + block_group->ro || + list_is_singular(&block_group->list)) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = inc_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_start_trans_remove_block_group(fs_info, + block_group->key.objectid); + if (IS_ERR(trans)) { + btrfs_dec_block_group_ro(block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -block_group->pinned, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + block_group->pinned = 0; + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(fs_info, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. + */ + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_get_block_group(block_group); + } +end_trans: + btrfs_end_transaction(trans); +next: + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +static int find_first_block_group(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_block_group_item bg; + u64 flags; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else if (em->start != found_key.objectid || + em->len != found_key.offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + found_key.objectid, found_key.offset, + em->start, em->len); + ret = -EUCLEAN; + } else { + read_extent_buffer(leaf, &bg, + btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & + BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + found_key.objectid, + found_key.offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & + em->map_lookup->type)); + ret = -EUCLEAN; + } else { + ret = 0; + } + } + free_extent_map(em); + goto out; + } + path->slots[0]++; + } +out: + return ret; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +static int exclude_super_stripes(struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid, + stripe_len); + if (ret) + return ret; + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(fs_info, cache->key.objectid, + bytenr, &logical, &nr, &stripe_len); + if (ret) + return ret; + + while (nr--) { + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = btrfs_add_excluded_extent(fs_info, start, len); + if (ret) { + kfree(logical); + return ret; + } + } + + kfree(logical); + } + return 0; +} + +static void link_block_group(struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *space_info = cache->space_info; + int index = btrfs_bg_flags_to_raid_index(cache->flags); + bool first = false; + + down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) + btrfs_sysfs_add_block_group_type(cache); +} + +static struct btrfs_block_group_cache *btrfs_create_block_group_cache( + struct btrfs_fs_info *fs_info, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->fs_info = fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); + set_free_space_tree_thresholds(cache); + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); + btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); + + return cache; +} + +/* + * Iterate all chunks and verify that each of them has the corresponding block + * group + */ +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct btrfs_block_group_cache *bg; + u64 start = 0; + int ret = 0; + + while (1) { + read_lock(&map_tree->lock); + /* + * lookup_extent_mapping will return the first extent map + * intersecting the range, so setting @len to 1 is enough to + * get the first chunk. + */ + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); + if (!em) + break; + + bg = btrfs_lookup_block_group(fs_info, em->start); + if (!bg) { + btrfs_err(fs_info, + "chunk start=%llu len=%llu doesn't have corresponding block group", + em->start, em->len); + ret = -EUCLEAN; + free_extent_map(em); + break; + } + if (bg->key.objectid != em->start || + bg->key.offset != em->len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", + em->start, em->len, + em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + bg->key.objectid, bg->key.offset, + bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ret = -EUCLEAN; + free_extent_map(em); + btrfs_put_block_group(bg); + break; + } + start = em->start + em->len; + free_extent_map(em); + btrfs_put_block_group(bg); + } + return ret; +} + +int btrfs_read_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; + u64 feature; + int mixed; + + feature = btrfs_super_incompat_flags(info->super_copy); + mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); + + key.objectid = 0; + key.offset = 0; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + cache_gen = btrfs_super_cache_generation(info->super_copy); + if (btrfs_test_opt(info, SPACE_CACHE) && + btrfs_super_generation(info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(info, CLEAR_CACHE)) + need_clear = 1; + + while (1) { + ret = find_first_block_group(info, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + cache = btrfs_create_block_group_cache(info, found_key.objectid, + found_key.offset); + if (!cache) { + ret = -ENOMEM; + goto error; + } + + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + if (btrfs_test_opt(info, SPACE_CACHE)) + cache->disk_cache_state = BTRFS_DC_CLEAR; + } + + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + cache->flags = btrfs_block_group_flags(&cache->item); + if (!mixed && + ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->key.objectid); + ret = -EINVAL; + goto error; + } + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(path); + + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + ret = exclude_super_stripes(cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); + goto error; + } + + /* + * Check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _a_lot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + btrfs_free_excluded_extents(cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, found_key.objectid, + found_key.objectid + + found_key.offset); + btrfs_free_excluded_extents(cache); + } + + ret = btrfs_add_block_group_cache(info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + + trace_btrfs_add_block_group(info, cache, 0); + btrfs_update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); + + cache->space_info = space_info; + + link_block_group(cache); + + set_avail_alloc_bits(info, cache->flags); + if (btrfs_chunk_readonly(info, cache->key.objectid)) { + inc_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + ASSERT(list_empty(&cache->bg_list)); + btrfs_mark_bg_unused(cache); + } + } + + list_for_each_entry_rcu(space_info, &info->space_info, list) { + if (!(btrfs_get_alloc_profile(info, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * Avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + inc_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + inc_block_group_ro(cache, 1); + } + + btrfs_init_global_block_rsv(info); + ret = check_chunk_block_group_mappings(info); +error: + btrfs_free_path(path); + return ret; +} + +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *block_group; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + if (!trans->can_flush_pending_bgs) + return; + + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, + bg_list); + if (ret) + goto next; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, ret); + ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, ret); + add_block_group_free_space(trans, block_group); + /* Already aborted the transaction if it failed. */ +next: + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } + btrfs_trans_release_chunk_metadata(trans); +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, + u64 type, u64 chunk_offset, u64 size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache; + int ret; + + btrfs_set_log_full_commit(trans); + + cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); + if (!cache) + return -ENOMEM; + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_block_group_flags(&cache->item, type); + + cache->flags = type; + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); + return ret; + } + + add_new_free_space(cache, chunk_offset, chunk_offset + size); + + btrfs_free_excluded_extents(cache); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(cache); + } +#endif + /* + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. + */ + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + ASSERT(cache->space_info); + + ret = btrfs_add_block_group_cache(fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ + trace_btrfs_add_block_group(fs_info, cache, 1); + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, + cache->bytes_super, &cache->space_info); + btrfs_update_global_block_rsv(fs_info); + + link_block_group(cache); + + list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); + + set_avail_alloc_bits(fs_info, type); + return 0; +} + +static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 num_devices; + u64 stripped; + + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); + + num_devices = fs_info->fs_devices->rw_devices; + + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; + + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* this is drive concat, leave it alone */ + } + + return flags; +} + +int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) + +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; + +again: + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; + + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + goto again; + } + + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. + */ + alloc_flags = update_block_group_flags(fs_info, cache->flags); + if (alloc_flags != cache->flags) { + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + + ret = inc_block_group_ro(cache, 0); + if (!ret) + goto out; + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); +out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(fs_info, cache->flags); + mutex_lock(&fs_info->chunk_mutex); + check_system_chunk(trans, alloc_flags); + mutex_unlock(&fs_info->chunk_mutex); + } + mutex_unlock(&fs_info->ro_block_group_mutex); + + btrfs_end_transaction(trans); + return ret; +} + +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_root *extent_root = fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto fail; + } + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); +fail: + btrfs_release_path(path); + return ret; + +} + +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + u64 num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * SZ_1M)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + + if (trans->aborted) + return 0; +again: + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, ret); + goto out_put; + } + WARN_ON(ret); + + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + + if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(fs_info, + &fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(trans, NULL, inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(fs_info, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). + */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = div_u64(block_group->key.offset, SZ_256M); + if (!num_pages) + num_pages = 1; + + num_pages *= 16; + num_pages *= PAGE_SIZE; + + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ + if (!ret) + dcs = BTRFS_DC_SETUP; + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + extent_changeset_free(data_reserved); + return ret; +} + +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(fs_info, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + +/* + * Transaction commit does final block group cache writeback during a critical + * section where nothing is allowed to change the FS. This is required in + * order for the cache to actually match the block group, but can introduce a + * lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. + * There's a chance we'll have to redo some of it if the block group changes + * again during the commit, but it greatly reduces the commit latency by + * getting rid of the easy block groups while we're still allowing others to + * join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* Make sure all the block groups on our dirty list actually exist */ + btrfs_create_pending_block_groups(trans); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + bool drop_reserve = true; + + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * This can happen if something re-dirties a block group that + * is already under IO. Just wait for it to finish and then do + * it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide if + * it should update the cache_state. Don't delete until after + * we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + drop_reserve = false; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, ret); + } + } + + /* If it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * Go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret < 0) { + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + spin_lock(&cur_trans->dirty_bgs_lock); + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + + /* + * This can happen if cache_save_setup re-dirties a block group + * that is already under IO. Just wait for it to finish and + * then do it all again + */ + if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + + /* + * Don't remove from the dirty list until after we've waited on + * any pending IO + */ + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (!ret) + ret = btrfs_run_delayed_refs(trans, + (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * a very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, path, cache); + } + if (ret) + btrfs_abort_transaction(trans, ret); + } + + /* If its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); + spin_lock(&cur_trans->dirty_bgs_lock); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int alloc) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + int ret = 0; + + /* Block accounting for super block */ + spin_lock(&info->delalloc_root_lock); + old_val = btrfs_super_bytes_used(info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(info->super_copy, old_val); + spin_unlock(&info->delalloc_root_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) { + ret = -ENOENT; + break; + } + factor = btrfs_bg_type_to_factor(cache->flags); + + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + btrfs_cache_block_group(cache, 1); + + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, + cache->space_info, num_bytes); + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->delayed_ref_updates++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) + btrfs_mark_bg_unused(cache); + + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + btrfs_update_delayed_refs_rsv(trans); + return ret; +} + +/** + * btrfs_add_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space. If this is a + * reservation and the block group has become read only we cannot make the + * reservation and return -EAGAIN, otherwise this function always succeeds. + */ +int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} + +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ +void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); +} + +static int should_alloc_chunk(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, int force) +{ + u64 bytes_used = btrfs_space_info_used(sinfo, false); + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(fs_info->super_copy); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + + if (sinfo->total_bytes - bytes_used < thresh) + return 1; + } + + if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + return 0; + return 1; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) +{ + u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); +} + +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *space_info; + bool wait_for_alloc = false; + bool should_alloc = false; + int ret = 0; + + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + + do { + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + should_alloc = should_alloc_chunk(fs_info, space_info, force); + if (space_info->full) { + /* No more free physical space */ + if (should_alloc) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } else if (!should_alloc) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + /* + * Someone is already allocating, so we need to block + * until this someone is finished and then loop to + * recheck if we should continue with our allocation + * attempt. + */ + wait_for_alloc = true; + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + /* Proceed with allocation */ + space_info->chunk_alloc = 1; + wait_for_alloc = false; + spin_unlock(&space_info->lock); + } + + cond_resched(); + } while (wait_for_alloc); + + mutex_lock(&fs_info->chunk_mutex); + trans->allocating_chunk = true; + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, flags); + + ret = btrfs_alloc_chunk(trans, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); + if (ret < 0) { + if (ret == -ENOSPC) + space_info->full = 1; + else + goto out; + } else { + ret = 1; + space_info->max_extent_size = 0; + } + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (u64)SZ_2M) + btrfs_create_pending_block_groups(trans); + + return ret; +} + +static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +{ + u64 num_dev; + + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) + num_dev = fs_info->fs_devices->rw_devices; + + return num_dev; +} + +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *info; + u64 left; + u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - btrfs_space_info_used(info, true); + spin_unlock(&info->lock); + + num_devs = get_profile_num_devs(fs_info, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); + btrfs_dump_space_info(fs_info, info, 0, 0); + } + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); + + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(fs_info->chunk_root, + &fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; + } +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = btrfs_next_block_group(block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + +/* + * Must be called only after stopping all workers, since we could have block + * group caching kthreads running, and therefore they could race with us if we + * freed the block groups before stopping them. + */ +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->commit_root_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } + up_write(&info->commit_root_sem); + + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) + btrfs_free_excluded_extents(block_group); + + btrfs_remove_free_space_cache(block_group); + ASSERT(block_group->cached != BTRFS_CACHE_STARTED); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* + * Now that all the block groups are freed, go through and free all the + * space_info structs. This is only called during the final stages of + * unmount, and so we know nobody is using them. We call + * synchronize_rcu() once before we start, just to be on the safe side. + */ + synchronize_rcu(); + + btrfs_release_global_block_rsv(info); + + while (!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + list_del(&space_info->list); + btrfs_sysfs_remove_space_info(space_info); + } + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h new file mode 100644 index 000000000000..c391800388dd --- /dev/null +++ b/fs/btrfs/block-group.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_GROUP_H +#define BTRFS_BLOCK_GROUP_H + +#include "free-space-cache.h" + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, +}; + +/* + * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to + * only allocate a chunk if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few + * chunks already allocated. This is used as part of the clustering code to + * help make sure we have a good pool of storage to cluster in, without filling + * the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_work work; + struct btrfs_block_group_cache *block_group; + u64 progress; + refcount_t count; +}; + +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP SZ_2M + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 pinned; + u64 reserved; + u64 delalloc_bytes; + u64 bytes_super; + u64 flags; + u64 cache_generation; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; + + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + + /* For raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + + unsigned int ro; + unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; + + int disk_cache_state; + + /* Cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* Free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* Block group cache stuff */ + struct rb_node cache_node; + + /* For block groups in the same raid type */ + struct list_head list; + + /* Usage count */ + atomic_t count; + + /* + * List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; + + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; + + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; +}; + +#ifdef CONFIG_BTRFS_DEBUG +static inline int btrfs_should_fragment_free_space( + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + +struct btrfs_block_group_cache *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group_cache *btrfs_next_block_group( + struct btrfs_block_group_cache *cache); +void btrfs_get_block_group(struct btrfs_block_group_cache *cache); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes); +int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache); +int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, + int load_cache_only); +void btrfs_put_caching_control(struct btrfs_caching_control *ctl); +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group_cache *cache); +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + u64 start, u64 end); +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, + const u64 chunk_offset); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); +int btrfs_read_block_groups(struct btrfs_fs_info *info); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, + u64 type, u64 chunk_offset, u64 size); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); +int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int alloc); +int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); +void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +int btrfs_free_block_groups(struct btrfs_fs_info *info); + +static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + +static inline int btrfs_block_group_cache_done( + struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; +} + +#endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 698470b9f32d..d07bd41a7c1e 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +#include "misc.h" #include "ctree.h" #include "block-rsv.h" #include "space-info.h" -#include "math.h" #include "transaction.h" static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, @@ -54,8 +54,9 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(fs_info, + space_info, + num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -258,6 +259,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; u64 num_bytes; + unsigned min_items; /* * The global block rsv is based on the size of the extent tree, the @@ -267,7 +269,26 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + btrfs_root_used(&fs_info->csum_root->root_item) + btrfs_root_used(&fs_info->tree_root->root_item); - num_bytes = max_t(u64, num_bytes, SZ_16M); + + /* + * We at a minimum are going to modify the csum root, the tree root, and + * the extent root. + */ + min_items = 3; + + /* + * But we also want to reserve enough space so we can do the fallback + * global reserve for an unlink, which is an additional 5 items (see the + * comment in __unlink_start_trans for what we're modifying.) + * + * But we also need space for the delayed ref updates from the unlink, + * so its 10, 5 for the actual operation, and 5 for the delayed ref + * updates. + */ + min_items += 10; + + num_bytes = max_t(u64, num_bytes, + btrfs_calc_insert_metadata_size(fs_info, min_items)); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); @@ -275,25 +296,16 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = min_t(u64, num_bytes, SZ_512M); if (block_rsv->reserved < block_rsv->size) { - num_bytes = btrfs_space_info_used(sinfo, true); - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - num_bytes = min(num_bytes, - block_rsv->size - block_rsv->reserved); - block_rsv->reserved += num_bytes; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, - 1); - } + num_bytes = block_rsv->size - block_rsv->reserved; + block_rsv->reserved += num_bytes; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(fs_info, sinfo, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; + btrfs_try_granting_tickets(fs_info, sinfo); } if (block_rsv->reserved == block_rsv->size) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 81a9731959a9..0b52ab4cb964 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -940,7 +940,7 @@ static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) kfree(sf); } -static int btrfsic_process_metablock( +static noinline_for_stack int btrfsic_process_metablock( struct btrfsic_state *state, struct btrfsic_block *const first_block, struct btrfsic_block_data_ctx *const first_block_ctx, @@ -1706,8 +1706,9 @@ static void btrfsic_dump_database(struct btrfsic_state *state) * Test whether the disk block contains a tree block (leaf or node) * (note that this test fails for the super block) */ -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - char **datav, unsigned int num_pages) +static noinline_for_stack int btrfsic_test_for_metadata( + struct btrfsic_state *state, + char **datav, unsigned int num_pages) { struct btrfs_fs_info *fs_info = state->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 60c47b417a4b..b05b361e2062 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -18,6 +18,7 @@ #include <linux/sched/mm.h> #include <linux/log2.h> #include <crypto/hash.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1039,7 +1040,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, struct list_head *workspace; int ret; - level = btrfs_compress_op[type]->set_level(level); + level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, start, pages, @@ -1611,7 +1612,23 @@ unsigned int btrfs_compress_str2level(unsigned int type, const char *str) level = 0; } - level = btrfs_compress_op[type]->set_level(level); + level = btrfs_compress_set_level(type, level); + + return level; +} + +/* + * Adjust @level according to the limits of the compression algorithm or + * fallback to default + */ +unsigned int btrfs_compress_set_level(int type, unsigned level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + if (level == 0) + level = ops->default_level; + else + level = min(level, ops->max_level); return level; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2035b8eb1290..4cb8be9ff88b 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -156,12 +156,9 @@ struct btrfs_compress_op { unsigned long start_byte, size_t srclen, size_t destlen); - /* - * This bounds the level set by the user to be within range of a - * particular compression type. It returns the level that will be used - * if the level is out of bounds or the default if 0 is passed in. - */ - unsigned int (*set_level)(unsigned int level); + /* Maximum level supported by the compression algorithm */ + unsigned int max_level; + unsigned int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -175,6 +172,8 @@ extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); +unsigned int btrfs_compress_set_level(int type, unsigned level); + int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5df76c17775a..e59cde204b2f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -29,6 +29,28 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); +static const struct btrfs_csums { + u16 size; + const char *name; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, +}; + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ + return btrfs_csums[t].size; +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].name; +} + struct btrfs_path *btrfs_alloc_path(void) { return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); @@ -376,8 +398,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * The 'start address' is the logical address of the *new* root node * for root replace operations, or the logical address of the affected * block for all other operations. - * - * Note: must be called with write lock for fs_info::tree_mod_log_lock. */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -387,6 +407,8 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node *parent = NULL; struct tree_mod_elem *cur; + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; @@ -1343,6 +1365,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb = NULL; struct extent_buffer *eb_root; + u64 eb_root_owner = 0; struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; @@ -1380,6 +1403,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(old); } } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); @@ -1396,7 +1420,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); + btrfs_set_header_owner(eb, eb_root_owner); btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } @@ -1790,8 +1814,8 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). */ -static noinline struct extent_buffer *read_node_slot( - struct extent_buffer *parent, int slot) +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot) { int level = btrfs_header_level(parent); struct extent_buffer *eb; @@ -1860,7 +1884,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; /* promote the child to a root */ - child = read_node_slot(mid, 0); + child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); btrfs_handle_fs_error(fs_info, ret, NULL); @@ -1900,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = read_node_slot(parent, pslot - 1); + left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -1915,7 +1939,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = read_node_slot(parent, pslot + 1); + right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2075,7 +2099,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = read_node_slot(parent, pslot - 1); + left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -2127,7 +2151,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = read_node_slot(parent, pslot + 1); + right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2889,15 +2913,13 @@ cow_done: if (!p->skip_locking) { level = btrfs_header_level(b); if (level <= write_lock_level) { - err = btrfs_try_tree_write_lock(b); - if (!err) { + if (!btrfs_try_tree_write_lock(b)) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); } p->locks[level] = BTRFS_WRITE_LOCK; } else { - err = btrfs_tree_read_lock_atomic(b); - if (!err) { + if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); } @@ -3031,8 +3053,7 @@ again: } level = btrfs_header_level(b); - err = btrfs_tree_read_lock_atomic(b); - if (!err) { + if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); } @@ -3572,7 +3593,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) if (!nr) return 0; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, l); start_item = btrfs_item_nr(start); end_item = btrfs_item_nr(end); data_len = btrfs_token_item_offset(l, start_item, &token) + @@ -3630,8 +3651,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, u32 data_end; u32 this_item_size; - btrfs_init_map_token(&token); - if (empty) nr = 0; else @@ -3704,6 +3723,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, push_items * sizeof(struct btrfs_item)); /* update the item pointers */ + btrfs_init_map_token(&token, right); right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); @@ -3781,7 +3801,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - right = read_node_slot(upper, slot + 1); + right = btrfs_read_node_slot(upper, slot + 1); /* * slot + 1 is not valid or we fail to read the right node, * no big deal, just return. @@ -3858,8 +3878,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, u32 old_left_item_size; struct btrfs_map_token token; - btrfs_init_map_token(&token); - if (empty) nr = min(right_nritems, max_slot); else @@ -3913,6 +3931,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); + btrfs_init_map_token(&token, left); old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; @@ -3944,6 +3963,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, (btrfs_header_nritems(right) - push_items) * sizeof(struct btrfs_item)); } + + btrfs_init_map_token(&token, right); right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); @@ -4015,7 +4036,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - left = read_node_slot(path->nodes[1], slot - 1); + left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* * slot - 1 is not valid or we fail to read the left node, * no big deal, just return. @@ -4074,8 +4095,6 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct btrfs_map_token token; - btrfs_init_map_token(&token); - nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); @@ -4091,6 +4110,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); + btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; @@ -4574,8 +4594,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) int i; struct btrfs_map_token token; - btrfs_init_map_token(&token); - leaf = path->nodes[0]; slot = path->slots[0]; @@ -4597,6 +4615,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(i); @@ -4671,8 +4690,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) int i; struct btrfs_map_token token; - btrfs_init_map_token(&token); - leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -4697,6 +4714,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(i); @@ -4748,8 +4766,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } btrfs_unlock_up_safe(path, 1); - btrfs_init_map_token(&token); - leaf = path->nodes[0]; slot = path->slots[0]; @@ -4763,6 +4779,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, BUG(); } + btrfs_init_map_token(&token, leaf); if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -4969,9 +4986,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -4983,12 +4997,14 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { int data_end = leaf_data_end(leaf); + struct btrfs_map_token token; memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); + btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { u32 ioff; @@ -5222,7 +5238,7 @@ find_next_key: goto out; } btrfs_set_path_blocking(path); - cur = read_node_slot(cur, slot); + cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); goto out; @@ -5244,368 +5260,6 @@ out: return ret; } -static int tree_move_down(struct btrfs_path *path, int *level) -{ - struct extent_buffer *eb; - - BUG_ON(*level == 0); - eb = read_node_slot(path->nodes[*level], path->slots[*level]); - if (IS_ERR(eb)) - return PTR_ERR(eb); - - path->nodes[*level - 1] = eb; - path->slots[*level - 1] = 0; - (*level)--; - return 0; -} - -static int tree_move_next_or_upnext(struct btrfs_path *path, - int *level, int root_level) -{ - int ret = 0; - int nritems; - nritems = btrfs_header_nritems(path->nodes[*level]); - - path->slots[*level]++; - - while (path->slots[*level] >= nritems) { - if (*level == root_level) - return -1; - - /* move upnext */ - path->slots[*level] = 0; - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - (*level)++; - path->slots[*level]++; - - nritems = btrfs_header_nritems(path->nodes[*level]); - ret = 1; - } - return ret; -} - -/* - * Returns 1 if it had to move up and next. 0 is returned if it moved only next - * or down. - */ -static int tree_advance(struct btrfs_path *path, - int *level, int root_level, - int allow_down, - struct btrfs_key *key) -{ - int ret; - - if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(path, level, root_level); - } else { - ret = tree_move_down(path, level); - } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } - return ret; -} - -static int tree_compare_item(struct btrfs_path *left_path, - struct btrfs_path *right_path, - char *tmp_buf) -{ - int cmp; - int len1, len2; - unsigned long off1, off2; - - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); - if (len1 != len2) - return 1; - - off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); - off2 = btrfs_item_ptr_offset(right_path->nodes[0], - right_path->slots[0]); - - read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); - - cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); - if (cmp) - return 1; - return 0; -} - -#define ADVANCE 1 -#define ADVANCE_ONLY_NEXT -1 - -/* - * This function compares two trees and calls the provided callback for - * every changed/new/deleted item it finds. - * If shared tree blocks are encountered, whole subtrees are skipped, making - * the compare pretty fast on snapshotted subvolumes. - * - * This currently works on commit roots only. As commit roots are read only, - * we don't do any locking. The commit roots are protected with transactions. - * Transactions are ended and rejoined when a commit is tried in between. - * - * This function checks for modifications done to the trees while comparing. - * If it detects a change, it aborts immediately. - */ -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) -{ - struct btrfs_fs_info *fs_info = left_root->fs_info; - int ret; - int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; - struct btrfs_key left_key; - struct btrfs_key right_key; - char *tmp_buf = NULL; - int left_root_level; - int right_root_level; - int left_level; - int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; - u64 left_blockptr; - u64 right_blockptr; - u64 left_gen; - u64 right_gen; - - left_path = btrfs_alloc_path(); - if (!left_path) { - ret = -ENOMEM; - goto out; - } - right_path = btrfs_alloc_path(); - if (!right_path) { - ret = -ENOMEM; - goto out; - } - - tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } - - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; - - /* - * Strategy: Go to the first items of both trees. Then do - * - * If both trees are at level 0 - * Compare keys of current items - * If left < right treat left item as new, advance left tree - * and repeat - * If left > right treat right item as deleted, advance right tree - * and repeat - * If left == right do deep compare of items, treat as changed if - * needed, advance both trees and repeat - * If both trees are at the same level but not at level 0 - * Compare keys of current nodes/leafs - * If left < right advance left tree and repeat - * If left > right advance right tree and repeat - * If left == right compare blockptrs of the next nodes/leafs - * If they match advance both trees but stay at the same level - * and repeat - * If they don't match advance both trees while allowing to go - * deeper and repeat - * If tree levels are different - * Advance the tree that needs it and repeat - * - * Advancing a tree means: - * If we are at level 0, try to go to the next slot. If that's not - * possible, go one level up and repeat. Stop when we found a level - * where we could go to the next slot. We may at this point be on a - * node or a leaf. - * - * If we are not at level 0 and not on shared tree blocks, go one - * level deeper. - * - * If we are not at level 0 and on shared tree blocks, go one slot to - * the right if possible or go up and right. - */ - - down_read(&fs_info->commit_root_sem); - left_level = btrfs_header_level(left_root->commit_root); - left_root_level = left_level; - left_path->nodes[left_level] = - btrfs_clone_extent_buffer(left_root->commit_root); - if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); - ret = -ENOMEM; - goto out; - } - - right_level = btrfs_header_level(right_root->commit_root); - right_root_level = right_level; - right_path->nodes[right_level] = - btrfs_clone_extent_buffer(right_root->commit_root); - if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); - ret = -ENOMEM; - goto out; - } - up_read(&fs_info->commit_root_sem); - - if (left_level == 0) - btrfs_item_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - else - btrfs_node_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - if (right_level == 0) - btrfs_item_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - else - btrfs_node_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; - - while (1) { - if (advance_left && !left_end_reached) { - ret = tree_advance(left_path, &left_level, - left_root_level, - advance_left != ADVANCE_ONLY_NEXT, - &left_key); - if (ret == -1) - left_end_reached = ADVANCE; - else if (ret < 0) - goto out; - advance_left = 0; - } - if (advance_right && !right_end_reached) { - ret = tree_advance(right_path, &right_level, - right_root_level, - advance_right != ADVANCE_ONLY_NEXT, - &right_key); - if (ret == -1) - right_end_reached = ADVANCE; - else if (ret < 0) - goto out; - advance_right = 0; - } - - if (left_end_reached && right_end_reached) { - ret = 0; - goto out; - } else if (left_end_reached) { - if (right_level == 0) { - ret = changed_cb(left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - } - advance_right = ADVANCE; - continue; - } else if (right_end_reached) { - if (left_level == 0) { - ret = changed_cb(left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - } - advance_left = ADVANCE; - continue; - } - - if (left_level == 0 && right_level == 0) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - ret = changed_cb(left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - } else if (cmp > 0) { - ret = changed_cb(left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - advance_right = ADVANCE; - } else { - enum btrfs_compare_tree_result result; - - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = tree_compare_item(left_path, right_path, - tmp_buf); - if (ret) - result = BTRFS_COMPARE_TREE_CHANGED; - else - result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_path, right_path, - &left_key, result, ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } else if (left_level == right_level) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - advance_left = ADVANCE; - } else if (cmp > 0) { - advance_right = ADVANCE; - } else { - left_blockptr = btrfs_node_blockptr( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_blockptr = btrfs_node_blockptr( - right_path->nodes[right_level], - right_path->slots[right_level]); - left_gen = btrfs_node_ptr_generation( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_gen = btrfs_node_ptr_generation( - right_path->nodes[right_level], - right_path->slots[right_level]); - if (left_blockptr == right_blockptr && - left_gen == right_gen) { - /* - * As we're on a shared block, don't - * allow to go deeper. - */ - advance_left = ADVANCE_ONLY_NEXT; - advance_right = ADVANCE_ONLY_NEXT; - } else { - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } - } else if (left_level < right_level) { - advance_right = ADVANCE; - } else { - advance_left = ADVANCE; - } - } - -out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); - kvfree(tmp_buf); - return ret; -} - /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the @@ -5623,7 +5277,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int slot; struct extent_buffer *c; - WARN_ON(!path->keep_locks); + WARN_ON(!path->keep_locks && !path->skip_locking); while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) return 1; @@ -5639,7 +5293,7 @@ next: !path->nodes[level + 1]) return 1; - if (path->locks[level + 1]) { + if (path->locks[level + 1] || path->skip_locking) { level++; continue; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 94660063a162..19d669d12ca1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -16,7 +16,6 @@ #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> -#include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> #include <asm/unaligned.h> @@ -39,10 +38,12 @@ struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; +struct btrfs_block_group_cache; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; +extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; @@ -82,10 +83,6 @@ struct btrfs_ref; */ #define BTRFS_LINK_MAX 65535U -/* four bytes for CRC32 */ -static const int btrfs_csum_sizes[] = { 4 }; -static const char *btrfs_csum_names[] = { "crc32c" }; - #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ @@ -397,12 +394,6 @@ struct btrfs_dev_replace { wait_queue_head_t replace_wait; }; -/* For raid type sysfs entries */ -struct raid_kobject { - u64 flags; - struct kobject kobj; -}; - /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata @@ -439,40 +430,6 @@ enum btrfs_caching_type { BTRFS_CACHE_ERROR, }; -enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN, - BTRFS_DC_ERROR, - BTRFS_DC_CLEAR, - BTRFS_DC_SETUP, -}; - -struct btrfs_caching_control { - struct list_head list; - struct mutex mutex; - wait_queue_head_t wait; - struct btrfs_work work; - struct btrfs_block_group_cache *block_group; - u64 progress; - refcount_t count; -}; - -/* Once caching_thread() finds this much free space, it will wake up waiters. */ -#define CACHING_CTL_WAKE_UP SZ_2M - -struct btrfs_io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_fs_info *fs_info; - struct inode *inode; - unsigned long size; - int index; - int num_pages; - int entries; - int bitmaps; - unsigned check_crcs:1; -}; - /* * Tree to record all locked full stripes of a RAID5/6 block group */ @@ -481,120 +438,6 @@ struct btrfs_full_stripe_locks_tree { struct mutex lock; }; -struct btrfs_block_group_cache { - struct btrfs_key key; - struct btrfs_block_group_item item; - struct btrfs_fs_info *fs_info; - struct inode *inode; - spinlock_t lock; - u64 pinned; - u64 reserved; - u64 delalloc_bytes; - u64 bytes_super; - u64 flags; - u64 cache_generation; - - /* - * If the free space extent count exceeds this number, convert the block - * group to bitmaps. - */ - u32 bitmap_high_thresh; - - /* - * If the free space extent count drops below this number, convert the - * block group back to extents. - */ - u32 bitmap_low_thresh; - - /* - * It is just used for the delayed data space allocation because - * only the data space allocation and the relative metadata update - * can be done cross the transaction. - */ - struct rw_semaphore data_rwsem; - - /* for raid56, this is a full stripe, without parity */ - unsigned long full_stripe_len; - - unsigned int ro; - unsigned int iref:1; - unsigned int has_caching_ctl:1; - unsigned int removed:1; - - int disk_cache_state; - - /* cache tracking stuff */ - int cached; - struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; - - struct btrfs_space_info *space_info; - - /* free space cache stuff */ - struct btrfs_free_space_ctl *free_space_ctl; - - /* block group cache stuff */ - struct rb_node cache_node; - - /* for block groups in the same raid type */ - struct list_head list; - - /* usage count */ - atomic_t count; - - /* List of struct btrfs_free_clusters for this block group. - * Today it will only have one thing on it, but that may change - */ - struct list_head cluster_list; - - /* For delayed block group creation or deletion of empty block groups */ - struct list_head bg_list; - - /* For read-only block groups */ - struct list_head ro_list; - - atomic_t trimming; - - /* For dirty block groups */ - struct list_head dirty_list; - struct list_head io_list; - - struct btrfs_io_ctl io_ctl; - - /* - * Incremented when doing extent allocations and holding a read lock - * on the space_info's groups_sem semaphore. - * Decremented when an ordered extent that represents an IO against this - * block group's range is created (after it's added to its inode's - * root's list of ordered extents) or immediately after the allocation - * if it's a metadata extent or fallocate extent (for these cases we - * don't create ordered extents). - */ - atomic_t reservations; - - /* - * Incremented while holding the spinlock *lock* by a task checking if - * it can perform a nocow write (incremented if the value for the *ro* - * field is 0). Decremented by such tasks once they create an ordered - * extent or before that if some error happens before reaching that step. - * This is to prevent races between block group relocation and nocow - * writes through direct IO. - */ - atomic_t nocow_writers; - - /* Lock for free space tree operations. */ - struct mutex free_space_lock; - - /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. - */ - int needs_free_space; - - /* Record locked full stripes for RAID5/6 block group */ - struct btrfs_full_stripe_locks_tree full_stripe_locks_root; -}; - /* delayed seq elem */ struct seq_list { struct list_head list; @@ -610,22 +453,6 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -/* used by the raid56 code to lock stripes for read/modify/write */ -struct btrfs_stripe_hash { - struct list_head hash_list; - spinlock_t lock; -}; - -/* used by the raid56 code to lock stripes for read/modify/write */ -struct btrfs_stripe_hash_table { - struct list_head stripe_cache; - spinlock_t cache_lock; - int cache_size; - struct btrfs_stripe_hash table[]; -}; - -#define BTRFS_STRIPE_HASH_TABLE_BITS 11 - void btrfs_init_async_reclaim_work(struct work_struct *work); /* fs_info */ @@ -1279,6 +1106,16 @@ struct btrfs_root { #endif }; +struct btrfs_clone_extent_info { + u64 disk_offset; + u64 disk_len; + u64 data_offset; + u64 data_len; + u64 file_offset; + char *extent_buf; + u32 item_size; +}; + struct btrfs_file_private { void *filldir_buf; }; @@ -1377,19 +1214,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) btrfs_clear_opt(fs_info->mount_opt, opt); \ } -#ifdef CONFIG_BTRFS_DEBUG -static inline int -btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} -#endif - /* * Requests for changes that need to be done during transaction commit. * @@ -1475,8 +1299,10 @@ struct btrfs_map_token { #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sb->s_blocksize_bits) -static inline void btrfs_init_map_token (struct btrfs_map_token *token) +static inline void btrfs_init_map_token(struct btrfs_map_token *token, + struct extent_buffer *eb) { + token->eb = eb; token->kaddr = NULL; } @@ -1507,17 +1333,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ unsigned long off, u##bits val, \ struct btrfs_map_token *token); \ -static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, \ - unsigned long off) \ -{ \ - return btrfs_get_token_##bits(eb, ptr, off, NULL); \ -} \ -static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\ - unsigned long off, u##bits val) \ -{ \ - btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ -} +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) @@ -2059,16 +1878,6 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, btrfs_disk_key_to_cpu(key, &disk_key); } -static inline u8 btrfs_key_type(const struct btrfs_key *key) -{ - return key->type; -} - -static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) -{ - key->type = val; -} - /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, @@ -2354,20 +2163,8 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_sizes[t]; -} - -static inline const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csum_names[csum_type]; -} +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); /* * The leaf data grows from end-to-front in the node. @@ -2440,30 +2237,6 @@ static inline u32 btrfs_file_extent_inline_item_len( return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } -/* btrfs_dev_stats_item */ -static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, - const struct btrfs_dev_stats_item *ptr, - int index) -{ - u64 val; - - read_extent_buffer(eb, &val, - offsetof(struct btrfs_dev_stats_item, values) + - ((unsigned long)ptr) + (index * sizeof(u64)), - sizeof(val)); - return val; -} - -static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, - struct btrfs_dev_stats_item *ptr, - int index, u64 val) -{ - write_extent_buffer(eb, &val, - offsetof(struct btrfs_dev_stats_item, values) + - ((unsigned long)ptr) + (index * sizeof(u64)), - sizeof(val)); -} - /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); @@ -2600,32 +2373,33 @@ enum btrfs_inline_ref_type { int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* - * Doing a truncate won't result in new nodes or leaves, just what we need for - * COW. + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. */ -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, - const u64 start); -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, @@ -2642,11 +2416,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr); -void btrfs_get_block_group(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2685,28 +2454,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_free_block_groups(struct btrfs_fs_info *info); -int btrfs_read_block_groups(struct btrfs_fs_info *info); -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, u64 chunk_offset, - u64 size); -struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( - struct btrfs_fs_info *fs_info, - const u64 chunk_offset); -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em); -void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); -u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); -u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); -u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2717,6 +2467,7 @@ enum btrfs_reserve_flush_enum { * case, use FLUSH LIMIT */ BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_EVICT, BTRFS_RESERVE_FLUSH_ALL, }; @@ -2729,31 +2480,10 @@ enum btrfs_flush_state { FLUSH_DELALLOC_WAIT = 6, ALLOC_CHUNK = 7, ALLOC_CHUNK_FORCE = 8, - COMMIT_TRANS = 9, -}; - -/* - * control flags for do_chunk_alloc's force field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum btrfs_chunk_alloc_enum { - CHUNK_ALLOC_NO_FORCE, - CHUNK_ALLOC_LIMITED, - CHUNK_ALLOC_FORCE, + RUN_DELAYED_IPUTS = 9, + COMMIT_TRANS = 10, }; -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - enum btrfs_chunk_alloc_enum force); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -2763,15 +2493,11 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); @@ -2780,10 +2506,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); -void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end); -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, @@ -2806,20 +2528,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); -enum btrfs_compare_tree_result { - BTRFS_COMPARE_TREE_NEW, - BTRFS_COMPARE_TREE_DELETED, - BTRFS_COMPARE_TREE_CHANGED, - BTRFS_COMPARE_TREE_SAME, -}; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t cb, void *ctx); +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot); + int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -3068,14 +2779,12 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const char *name, - int name_len, struct btrfs_inode_ref **ref_ret); -int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, - u64 ref_objectid, const char *name, - int name_len, - struct btrfs_inode_extref **extref_ret); - +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -3137,7 +2846,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int dedupe); + struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, @@ -3233,6 +2942,10 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); +int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, + const u64 start, const u64 end, + struct btrfs_clone_extent_info *clone_info, + struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -3248,12 +2961,6 @@ loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); -/* sysfs.c */ -int __init btrfs_init_sysfs(void); -void __cold btrfs_exit_sysfs(void); -int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); @@ -3722,26 +3429,4 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif -static inline void cond_wake_up(struct wait_queue_head *wq) -{ - /* - * This implies a full smp_mb barrier, see comments for - * waitqueue_active why. - */ - if (wq_has_sleeper(wq)) - wake_up(wq); -} - -static inline void cond_wake_up_nomb(struct wait_queue_head *wq) -{ - /* - * Special case for conditional wakeup where the barrier required for - * waitqueue_active is implied by some of the preceding code. Eg. one - * of such atomic operations (atomic_dec_and_return, ...), or a - * unlock/lock sequence, etc. - */ - if (waitqueue_active(wq)) - wake_up(wq); -} - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h deleted file mode 100644 index 90281a7a35a8..000000000000 --- a/fs/btrfs/dedupe.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2016 Fujitsu. All rights reserved. - */ - -#ifndef BTRFS_DEDUPE_H -#define BTRFS_DEDUPE_H - -/* later in-band dedupe will expand this struct */ -struct btrfs_dedupe_hash; - -#endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 17f7c0d38768..d949d7d2abed 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -7,6 +7,7 @@ #include "space-info.h" #include "transaction.h" #include "qgroup.h" +#include "block-group.h" int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { @@ -129,8 +130,6 @@ commit_trans: return -ENOSPC; } btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -182,8 +181,6 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } @@ -254,13 +251,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, lockdep_assert_held(&inode->lock); outstanding_extents = inode->outstanding_extents; - if (outstanding_extents) - reserve_size = btrfs_calc_trans_metadata_size(fs_info, - outstanding_extents + 1); + + /* + * Insert size for the number of outstanding extents, 1 normal size for + * updating the inode. + */ + if (outstanding_extents) { + reserve_size = btrfs_calc_insert_metadata_size(fs_info, + outstanding_extents); + reserve_size += btrfs_calc_metadata_size(fs_info, 1); + } csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - reserve_size += btrfs_calc_trans_metadata_size(fs_info, - csum_leaves); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, + csum_leaves); /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent @@ -281,10 +285,16 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, { u64 nr_extents = count_max_extents(num_bytes); u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); - /* We add one for the inode update at finish ordered time */ - *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, - nr_extents + csum_leaves + 1); + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, + nr_extents + csum_leaves); + + /* + * finish_ordered_io has to update the inode, so add the space required + * for an inode update. + */ + *meta_reserve += inode_update; *qgroup_reserve = nr_extents * fs_info->nodesize; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 43fdb2992956..1f7f39b10bd0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" @@ -474,6 +475,9 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; + /* Not associated with any delayed_node */ + if (!delayed_item->delayed_node) + return; delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); @@ -555,7 +559,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); /* * Here we migrate space rsv from transaction rsv, since have already @@ -609,7 +613,7 @@ static int btrfs_delayed_inode_reserve_metadata( src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_metadata_size(fs_info, 1); /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction @@ -1525,7 +1529,12 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. */ - BUG_ON(ret); + if (ret < 0) { + btrfs_err(trans->fs_info, +"metadata reservation failed for delayed dir item deltiona, should have been reserved"); + btrfs_release_delayed_item(item); + goto end; + } mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); @@ -1534,7 +1543,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", index, node->root->root_key.objectid, node->inode_id, ret); - BUG(); + btrfs_delayed_item_release_metadata(dir->root, item); + btrfs_release_delayed_item(item); } mutex_unlock(&node->mutex); end: diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9a91d1eb0af4..df3bd880061d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -79,7 +79,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, @@ -105,8 +105,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) if (!trans->delayed_ref_updates) return; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, - trans->delayed_ref_updates); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, + trans->delayed_ref_updates); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->full = 0; @@ -158,7 +158,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, num_bytes, 1); if (to_free) - btrfs_space_info_add_old_bytes(fs_info, + btrfs_space_info_free_bytes_may_use(fs_info, delayed_refs_rsv->space_info, to_free); } @@ -174,7 +174,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); u64 num_bytes = 0; int ret = -ENOSPC; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6b2e9aa83ffa..48890826b5e6 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/kthread.h> #include <linux/math64.h> +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -56,7 +57,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) no_valid_dev_replace_entry_found: ret = 0; dev_replace->replace_state = - BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; dev_replace->time_started = 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 97beb351a10c..044981cf6df9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -40,6 +40,7 @@ #include "compression.h" #include "tree-checker.h" #include "ref-verify.h" +#include "block-group.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -416,6 +417,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, */ if (btrfs_header_generation(eb) > fs_info->last_trans_committed) return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else @@ -1037,35 +1048,6 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); - if (ret) { - free_extent_buffer_stale(buf); - return ret; - } - - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer_stale(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { - free_extent_buffer(buf); - } - return 0; -} - struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e80f7c45a307..a6958103d87e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -45,8 +45,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); -int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8b7eb22d508a..49cb26fa7c63 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4,7 +4,6 @@ */ #include <linux/sched.h> -#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> @@ -17,6 +16,7 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> +#include "misc.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -25,13 +25,13 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "math.h" #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" +#include "block-group.h" #undef SCRAMBLE_DELAYED_REFS @@ -54,132 +54,13 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static noinline int -block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED || - cache->cached == BTRFS_CACHE_ERROR; -} - static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) -{ - atomic_inc(&cache->count); -} - -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) { - WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); - - /* - * If not empty, someone is still holding mutex of - * full_stripe_lock, which can only be released by caller. - * And it will definitely cause use-after-free when caller - * tries to release full stripe lock. - * - * No better way to resolve, but only to warn. - */ - WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); - kfree(cache->free_space_ctl); - kfree(cache); - } -} - -/* - * this adds the block group to the fs_info rb tree for the block group - * cache - */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group_cache *block_group) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group_cache *cache; - - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group_cache, - cache_node); - if (block_group->key.objectid < cache->key.objectid) { - p = &(*p)->rb_left; - } else if (block_group->key.objectid > cache->key.objectid) { - p = &(*p)->rb_right; - } else { - spin_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); - - if (info->first_logical_byte > block_group->key.objectid) - info->first_logical_byte = block_group->key.objectid; - - spin_unlock(&info->block_group_cache_lock); - - return 0; -} - -/* - * This will return the block group at or after bytenr if contains is 0, else - * it will return the block group that contains the bytenr - */ -static struct btrfs_block_group_cache * -block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, - int contains) -{ - struct btrfs_block_group_cache *cache, *ret = NULL; - struct rb_node *n; - u64 end, start; - - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; - - while (n) { - cache = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - end = cache->key.objectid + cache->key.offset - 1; - start = cache->key.objectid; - - if (bytenr < start) { - if (!contains && (!ret || start < ret->key.objectid)) - ret = cache; - n = n->rb_left; - } else if (bytenr > start) { - if (contains && bytenr <= end) { - ret = cache; - break; - } - n = n->rb_right; - } else { - ret = cache; - break; - } - } - if (ret) { - btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) - info->first_logical_byte = ret->key.objectid; - } - spin_unlock(&info->block_group_cache_lock); - - return ret; -} - -static int add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; set_extent_bits(&fs_info->freed_extents[0], @@ -189,7 +70,7 @@ static int add_excluded_extent(struct btrfs_fs_info *fs_info, return 0; } -static void free_excluded_extents(struct btrfs_block_group_cache *cache) +void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; u64 start, end; @@ -203,494 +84,6 @@ static void free_excluded_extents(struct btrfs_block_group_cache *cache) start, end, EXTENT_UPTODATE); } -static int exclude_super_stripes(struct btrfs_block_group_cache *cache) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { - stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; - cache->bytes_super += stripe_len; - ret = add_excluded_extent(fs_info, cache->key.objectid, - stripe_len); - if (ret) - return ret; - } - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->key.objectid, - bytenr, &logical, &nr, &stripe_len); - if (ret) - return ret; - - while (nr--) { - u64 start, len; - - if (logical[nr] > cache->key.objectid + - cache->key.offset) - continue; - - if (logical[nr] + stripe_len <= cache->key.objectid) - continue; - - start = logical[nr]; - if (start < cache->key.objectid) { - start = cache->key.objectid; - len = (logical[nr] + stripe_len) - start; - } else { - len = min_t(u64, stripe_len, - cache->key.objectid + - cache->key.offset - start); - } - - cache->bytes_super += len; - ret = add_excluded_extent(fs_info, start, len); - if (ret) { - kfree(logical); - return ret; - } - } - - kfree(logical); - } - return 0; -} - -static struct btrfs_caching_control * -get_caching_control(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *ctl; - - spin_lock(&cache->lock); - if (!cache->caching_ctl) { - spin_unlock(&cache->lock); - return NULL; - } - - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - spin_unlock(&cache->lock); - return ctl; -} - -static void put_caching_control(struct btrfs_caching_control *ctl) -{ - if (refcount_dec_and_test(&ctl->count)) - kfree(ctl); -} - -#ifdef CONFIG_BTRFS_DEBUG -static void fragment_free_space(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 start = block_group->key.objectid; - u64 len = block_group->key.offset; - u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? - fs_info->nodesize : fs_info->sectorsize; - u64 step = chunk << 1; - - while (len > chunk) { - btrfs_remove_free_space(block_group, start, chunk); - start += step; - if (len < step) - len = 0; - else - len -= step; - } -} -#endif - -/* - * this is only called by cache_block_group, since we could have freed extents - * we need to check the pinned_extents for any extents that can't be used yet - * since their free space will be released as soon as the transaction commits. - */ -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end) -{ - struct btrfs_fs_info *info = block_group->fs_info; - u64 extent_start, extent_end, size, total_added = 0; - int ret; - - while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) - break; - - if (extent_start <= start) { - start = extent_end + 1; - } else if (extent_start > start && extent_start < end) { - size = extent_start - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); - BUG_ON(ret); /* -ENOMEM or logic error */ - start = extent_end + 1; - } else { - break; - } - } - - if (start < end) { - size = end - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ - } - - return total_added; -} - -static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) -{ - struct btrfs_block_group_cache *block_group = caching_ctl->block_group; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - u64 total_found = 0; - u64 last = 0; - u32 nritems; - int ret; - bool wakeup = true; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); - -#ifdef CONFIG_BTRFS_DEBUG - /* - * If we're fragmenting we don't want to make anybody think we can - * allocate from this block group until we've had a chance to fragment - * the free space. - */ - if (btrfs_should_fragment_free_space(block_group)) - wakeup = false; -#endif - /* - * We don't want to deadlock with somebody trying to allocate a new - * extent for the extent root while also trying to search the extent - * root to add free space. So we skip locking and search the commit - * root, since its read-only - */ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = READA_FORWARD; - - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; - -next: - ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (btrfs_fs_closing(fs_info) > 1) { - last = (u64)-1; - break; - } - - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - } else { - ret = find_next_key(path, 0, &key); - if (ret) - break; - - if (need_resched() || - rwsem_is_contended(&fs_info->commit_root_sem)) { - if (wakeup) - caching_ctl->progress = last; - btrfs_release_path(path); - up_read(&fs_info->commit_root_sem); - mutex_unlock(&caching_ctl->mutex); - cond_resched(); - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - goto next; - } - - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - if (key.objectid < last) { - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; - - if (wakeup) - caching_ctl->progress = last; - btrfs_release_path(path); - goto next; - } - - if (key.objectid < block_group->key.objectid) { - path->slots[0]++; - continue; - } - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, last, - key.objectid); - if (key.type == BTRFS_METADATA_ITEM_KEY) - last = key.objectid + - fs_info->nodesize; - else - last = key.objectid + key.offset; - - if (total_found > CACHING_CTL_WAKE_UP) { - total_found = 0; - if (wakeup) - wake_up(&caching_ctl->wait); - } - } - path->slots[0]++; - } - ret = 0; - - total_found += add_new_free_space(block_group, last, - block_group->key.objectid + - block_group->key.offset); - caching_ctl->progress = (u64)-1; - -out: - btrfs_free_path(path); - return ret; -} - -static noinline void caching_thread(struct btrfs_work *work) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; - int ret; - - caching_ctl = container_of(work, struct btrfs_caching_control, work); - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; - - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - ret = load_free_space_tree(caching_ctl); - else - ret = load_extent_tree_free(caching_ctl); - - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; - spin_unlock(&block_group->lock); - -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(block_group)) { - u64 bytes_used; - - spin_lock(&block_group->space_info->lock); - spin_lock(&block_group->lock); - bytes_used = block_group->key.offset - - btrfs_block_group_used(&block_group->item); - block_group->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&block_group->lock); - spin_unlock(&block_group->space_info->lock); - fragment_free_space(block_group); - } -#endif - - caching_ctl->progress = (u64)-1; - - up_read(&fs_info->commit_root_sem); - free_excluded_extents(block_group); - mutex_unlock(&caching_ctl->mutex); - - wake_up(&caching_ctl->wait); - - put_caching_control(caching_ctl); - btrfs_put_block_group(block_group); -} - -static int cache_block_group(struct btrfs_block_group_cache *cache, - int load_cache_only) -{ - DEFINE_WAIT(wait); - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_caching_control *caching_ctl; - int ret = 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - if (!caching_ctl) - return -ENOMEM; - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - refcount_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, - caching_thread, NULL, NULL); - - spin_lock(&cache->lock); - /* - * This should be a rare occasion, but this could happen I think in the - * case where one thread starts to load the space cache info, and then - * some other thread starts a transaction commit which tries to do an - * allocation while the other thread is still loading the space cache - * info. The previous loop should have kept us from choosing this block - * group, but if we've moved to the state where we will wait on caching - * block groups we need to first check if we're doing a fast load here, - * so we can wait for it to finish, otherwise we could end up allocating - * from a block group who's cache gets evicted for one reason or - * another. - */ - while (cache->cached == BTRFS_CACHE_FAST) { - struct btrfs_caching_control *ctl; - - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&cache->lock); - - schedule(); - - finish_wait(&ctl->wait, &wait); - put_caching_control(ctl); - spin_lock(&cache->lock); - } - - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); - return 0; - } - WARN_ON(cache->caching_ctl); - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - if (btrfs_test_opt(fs_info, SPACE_CACHE)) { - mutex_lock(&caching_ctl->mutex); - ret = load_free_space_cache(cache); - - spin_lock(&cache->lock); - if (ret == 1) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_FINISHED; - cache->last_byte_to_unpin = (u64)-1; - caching_ctl->progress = (u64)-1; - } else { - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - } - spin_unlock(&cache->lock); -#ifdef CONFIG_BTRFS_DEBUG - if (ret == 1 && - btrfs_should_fragment_free_space(cache)) { - u64 bytes_used; - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - bytes_used = cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fragment_free_space(cache); - } -#endif - mutex_unlock(&caching_ctl->mutex); - - wake_up(&caching_ctl->wait); - if (ret == 1) { - put_caching_control(caching_ctl); - free_excluded_extents(cache); - return 0; - } - } else { - /* - * We're either using the free space tree or no caching at all. - * Set cached to the appropriate value and wakeup any waiters. - */ - spin_lock(&cache->lock); - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - spin_unlock(&cache->lock); - wake_up(&caching_ctl->wait); - } - - if (load_cache_only) { - put_caching_control(caching_ctl); - return 0; - } - - down_write(&fs_info->commit_root_sem); - refcount_inc(&caching_ctl->count); - list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->commit_root_sem); - - btrfs_get_block_group(cache); - - btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); - - return ret; -} - -/* - * return the block group that starts at or after bytenr - */ -static struct btrfs_block_group_cache * -btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) -{ - return block_group_cache_tree_search(info, bytenr, 0); -} - -/* - * return the block group that contains the given bytenr - */ -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr) -{ - return block_group_cache_tree_search(info, bytenr, 1); -} - static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { if (ref->type == BTRFS_REF_METADATA) { @@ -1045,7 +438,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, return BTRFS_REF_TYPE_INVALID; } -static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { u32 high_crc = ~(u32)0; u32 low_crc = ~(u32)0; @@ -2964,16 +2357,19 @@ static noinline int check_committed_ref(struct btrfs_root *root, item_size = btrfs_item_size_nr(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + /* If extent item has more than 1 inline ref then it's shared */ if (item_size != sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; + /* If extent created before last snapshot => it's definitely shared */ if (btrfs_extent_generation(leaf, ei) <= btrfs_root_last_snapshot(&root->root_item)) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); + /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) goto out; @@ -3118,552 +2514,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_block_group_cache *cache) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; - struct btrfs_root *extent_root = fs_info->extent_root; - unsigned long bi; - struct extent_buffer *leaf; - - ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto fail; - } - - leaf = path->nodes[0]; - bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); - btrfs_mark_buffer_dirty(leaf); -fail: - btrfs_release_path(path); - return ret; - -} - -static struct btrfs_block_group_cache *next_block_group( - struct btrfs_block_group_cache *cache) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - struct rb_node *node; - - spin_lock(&fs_info->block_group_cache_lock); - - /* If our block group was removed, we need a full search. */ - if (RB_EMPTY_NODE(&cache->cache_node)) { - const u64 next_bytenr = cache->key.objectid + cache->key.offset; - - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; - } - node = rb_next(&cache->cache_node); - btrfs_put_block_group(cache); - if (node) { - cache = rb_entry(node, struct btrfs_block_group_cache, - cache_node); - btrfs_get_block_group(cache); - } else - cache = NULL; - spin_unlock(&fs_info->block_group_cache_lock); - return cache; -} - -static int cache_save_setup(struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->tree_root; - struct inode *inode = NULL; - struct extent_changeset *data_reserved = NULL; - u64 alloc_hint = 0; - int dcs = BTRFS_DC_ERROR; - u64 num_pages = 0; - int retries = 0; - int ret = 0; - - /* - * If this block group is smaller than 100 megs don't bother caching the - * block group. - */ - if (block_group->key.offset < (100 * SZ_1M)) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } - - if (trans->aborted) - return 0; -again: - inode = lookup_free_space_inode(block_group, path); - if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { - ret = PTR_ERR(inode); - btrfs_release_path(path); - goto out; - } - - if (IS_ERR(inode)) { - BUG_ON(retries); - retries++; - - if (block_group->ro) - goto out_free; - - ret = create_free_space_inode(trans, block_group, path); - if (ret) - goto out_free; - goto again; - } - - /* - * We want to set the generation to 0, that way if anything goes wrong - * from here on out we know not to trust this cache when we load up next - * time. - */ - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - /* - * So theoretically we could recover from this, simply set the - * super cache generation to 0 so we know to invalidate the - * cache, but then we'd have to keep track of the block groups - * that fail this way so we know we _have_ to reset this cache - * before the next commit or risk reading stale cache. So to - * limit our exposure to horrible edge cases lets just abort the - * transaction, this only happens in really bad situations - * anyway. - */ - btrfs_abort_transaction(trans, ret); - goto out_put; - } - WARN_ON(ret); - - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - - if (i_size_read(inode) > 0) { - ret = btrfs_check_trunc_cache_free_space(fs_info, - &fs_info->global_block_rsv); - if (ret) - goto out_put; - - ret = btrfs_truncate_free_space_cache(trans, NULL, inode); - if (ret) - goto out_put; - } - - spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(fs_info, SPACE_CACHE)) { - /* - * don't bother trying to write stuff out _if_ - * a) we're not cached, - * b) we're with nospace_cache mount option, - * c) we're with v2 space_cache (FREE_SPACE_TREE). - */ - dcs = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - goto out_put; - } - spin_unlock(&block_group->lock); - - /* - * We hit an ENOSPC when setting up the cache in this transaction, just - * skip doing the setup, we've already cleared the cache so we're safe. - */ - if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { - ret = -ENOSPC; - goto out_put; - } - - /* - * Try to preallocate enough space based on how big the block group is. - * Keep in mind this has to include any pinned space which could end up - * taking up quite a bit since it's not folded into the other space - * cache. - */ - num_pages = div_u64(block_group->key.offset, SZ_256M); - if (!num_pages) - num_pages = 1; - - num_pages *= 16; - num_pages *= PAGE_SIZE; - - ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, - num_pages, num_pages, - &alloc_hint); - /* - * Our cache requires contiguous chunks so that we don't modify a bunch - * of metadata or split extents when writing the cache out, which means - * we can enospc if we are heavily fragmented in addition to just normal - * out of space conditions. So if we hit this just skip setting up any - * other block groups for this transaction, maybe we'll unpin enough - * space the next time around. - */ - if (!ret) - dcs = BTRFS_DC_SETUP; - else if (ret == -ENOSPC) - set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - -out_put: - iput(inode); -out_free: - btrfs_release_path(path); -out: - spin_lock(&block_group->lock); - if (!ret && dcs == BTRFS_DC_SETUP) - block_group->cache_generation = trans->transid; - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - - extent_changeset_free(data_reserved); - return ret; -} - -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache, *tmp; - struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; - - if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(fs_info, SPACE_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* Could add new block groups, use _safe just in case */ - list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, - dirty_list) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - cache_save_setup(cache, trans, path); - } - - btrfs_free_path(path); - return 0; -} - -/* - * transaction commit does final block group cache writeback during a - * critical section where nothing is allowed to change the FS. This is - * required in order for the cache to actually match the block group, - * but can introduce a lot of latency into the commit. - * - * So, btrfs_start_dirty_block_groups is here to kick off block group - * cache IO. There's a chance we'll have to redo some of it if the - * block group changes again during the commit, but it greatly reduces - * the commit latency by getting rid of the easy block groups while - * we're still allowing others to join the commit. - */ -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; - struct btrfs_transaction *cur_trans = trans->transaction; - int ret = 0; - int should_put; - struct btrfs_path *path = NULL; - LIST_HEAD(dirty); - struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; - int loops = 0; - - spin_lock(&cur_trans->dirty_bgs_lock); - if (list_empty(&cur_trans->dirty_bgs)) { - spin_unlock(&cur_trans->dirty_bgs_lock); - return 0; - } - list_splice_init(&cur_trans->dirty_bgs, &dirty); - spin_unlock(&cur_trans->dirty_bgs_lock); - -again: - /* - * make sure all the block groups on our dirty list actually - * exist - */ - btrfs_create_pending_block_groups(trans); - - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } - - /* - * cache_write_mutex is here only to save us from balance or automatic - * removal of empty block groups deleting this block group while we are - * writing out the cache - */ - mutex_lock(&trans->transaction->cache_write_mutex); - while (!list_empty(&dirty)) { - bool drop_reserve = true; - - cache = list_first_entry(&dirty, - struct btrfs_block_group_cache, - dirty_list); - /* - * this can happen if something re-dirties a block - * group that is already under IO. Just wait for it to - * finish and then do it all again - */ - if (!list_empty(&cache->io_list)) { - list_del_init(&cache->io_list); - btrfs_wait_cache_io(trans, cache, path); - btrfs_put_block_group(cache); - } - - - /* - * btrfs_wait_cache_io uses the cache->dirty_list to decide - * if it should update the cache_state. Don't delete - * until after we wait. - * - * Since we're not running in the commit critical section - * we need the dirty_bgs_lock to protect from update_block_group - */ - spin_lock(&cur_trans->dirty_bgs_lock); - list_del_init(&cache->dirty_list); - spin_unlock(&cur_trans->dirty_bgs_lock); - - should_put = 1; - - cache_save_setup(cache, trans, path); - - if (cache->disk_cache_state == BTRFS_DC_SETUP) { - cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(trans, cache, path); - if (ret == 0 && cache->io_ctl.inode) { - num_started++; - should_put = 0; - - /* - * The cache_write_mutex is protecting the - * io_list, also refer to the definition of - * btrfs_transaction::io_bgs for more details - */ - list_add_tail(&cache->io_list, io); - } else { - /* - * if we failed to write the cache, the - * generation will be bad and life goes on - */ - ret = 0; - } - } - if (!ret) { - ret = write_one_cache_group(trans, path, cache); - /* - * Our block group might still be attached to the list - * of new block groups in the transaction handle of some - * other task (struct btrfs_trans_handle->new_bgs). This - * means its block group item isn't yet in the extent - * tree. If this happens ignore the error, as we will - * try again later in the critical section of the - * transaction commit. - */ - if (ret == -ENOENT) { - ret = 0; - spin_lock(&cur_trans->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &cur_trans->dirty_bgs); - btrfs_get_block_group(cache); - drop_reserve = false; - } - spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret) { - btrfs_abort_transaction(trans, ret); - } - } - - /* if it's not on the io list, we need to put the block group */ - if (should_put) - btrfs_put_block_group(cache); - if (drop_reserve) - btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - - /* - * Avoid blocking other tasks for too long. It might even save - * us from writing caches for block groups that are going to be - * removed. - */ - mutex_unlock(&trans->transaction->cache_write_mutex); - mutex_lock(&trans->transaction->cache_write_mutex); - } - mutex_unlock(&trans->transaction->cache_write_mutex); - - /* - * go through delayed refs for all the stuff we've just kicked off - * and then loop back (just once) - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (!ret && loops == 0) { - loops++; - spin_lock(&cur_trans->dirty_bgs_lock); - list_splice_init(&cur_trans->dirty_bgs, &dirty); - /* - * dirty_bgs_lock protects us from concurrent block group - * deletes too (not just cache_write_mutex). - */ - if (!list_empty(&dirty)) { - spin_unlock(&cur_trans->dirty_bgs_lock); - goto again; - } - spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { - btrfs_cleanup_dirty_bgs(cur_trans, fs_info); - } - - btrfs_free_path(path); - return ret; -} - -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; - struct btrfs_transaction *cur_trans = trans->transaction; - int ret = 0; - int should_put; - struct btrfs_path *path; - struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * Even though we are in the critical section of the transaction commit, - * we can still have concurrent tasks adding elements to this - * transaction's list of dirty block groups. These tasks correspond to - * endio free space workers started when writeback finishes for a - * space cache, which run inode.c:btrfs_finish_ordered_io(), and can - * allocate new block groups as a result of COWing nodes of the root - * tree when updating the free space inode. The writeback for the space - * caches is triggered by an earlier call to - * btrfs_start_dirty_block_groups() and iterations of the following - * loop. - * Also we want to do the cache_save_setup first and then run the - * delayed refs to make sure we have the best chance at doing this all - * in one shot. - */ - spin_lock(&cur_trans->dirty_bgs_lock); - while (!list_empty(&cur_trans->dirty_bgs)) { - cache = list_first_entry(&cur_trans->dirty_bgs, - struct btrfs_block_group_cache, - dirty_list); - - /* - * this can happen if cache_save_setup re-dirties a block - * group that is already under IO. Just wait for it to - * finish and then do it all again - */ - if (!list_empty(&cache->io_list)) { - spin_unlock(&cur_trans->dirty_bgs_lock); - list_del_init(&cache->io_list); - btrfs_wait_cache_io(trans, cache, path); - btrfs_put_block_group(cache); - spin_lock(&cur_trans->dirty_bgs_lock); - } - - /* - * don't remove from the dirty list until after we've waited - * on any pending IO - */ - list_del_init(&cache->dirty_list); - spin_unlock(&cur_trans->dirty_bgs_lock); - should_put = 1; - - cache_save_setup(cache, trans, path); - - if (!ret) - ret = btrfs_run_delayed_refs(trans, - (unsigned long) -1); - - if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { - cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(trans, cache, path); - if (ret == 0 && cache->io_ctl.inode) { - num_started++; - should_put = 0; - list_add_tail(&cache->io_list, io); - } else { - /* - * if we failed to write the cache, the - * generation will be bad and life goes on - */ - ret = 0; - } - } - if (!ret) { - ret = write_one_cache_group(trans, path, cache); - /* - * One of the free space endio workers might have - * created a new block group while updating a free space - * cache's inode (at inode.c:btrfs_finish_ordered_io()) - * and hasn't released its transaction handle yet, in - * which case the new block group is still attached to - * its transaction handle and its creation has not - * finished yet (no block group item in the extent tree - * yet, etc). If this is the case, wait for all free - * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and - * complex approach. - */ - if (ret == -ENOENT) { - wait_event(cur_trans->writer_wait, - atomic_read(&cur_trans->num_writers) == 1); - ret = write_one_cache_group(trans, path, cache); - } - if (ret) - btrfs_abort_transaction(trans, ret); - } - - /* if its not on the io list, we need to put the block group */ - if (should_put) - btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); - spin_lock(&cur_trans->dirty_bgs_lock); - } - spin_unlock(&cur_trans->dirty_bgs_lock); - - /* - * Refer to the definition of io_bgs member for details why it's safe - * to use it without any locking - */ - while (!list_empty(io)) { - cache = list_first_entry(io, struct btrfs_block_group_cache, - io_list); - list_del_init(&cache->io_list); - btrfs_wait_cache_io(trans, cache, path); - btrfs_put_block_group(cache); - } - - btrfs_free_path(path); - return ret; -} - int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group_cache *block_group; @@ -3677,166 +2527,6 @@ int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) return readonly; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group_cache *bg; - bool ret = true; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - if (!bg) - return false; - - spin_lock(&bg->lock); - if (bg->ro) - ret = false; - else - atomic_inc(&bg->nocow_writers); - spin_unlock(&bg->lock); - - /* no put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) - btrfs_put_block_group(bg); - - return ret; - -} - -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group_cache *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); - if (atomic_dec_and_test(&bg->nocow_writers)) - wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); - btrfs_put_block_group(bg); -} - -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) -{ - wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); -} - -static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - write_seqlock(&fs_info->profiles_lock); - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - write_sequnlock(&fs_info->profiles_lock); -} - -/* - * returns target flags in extended format or 0 if restripe for this - * chunk_type is not in progress - * - * should be called with balance_lock held - */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - u64 target = 0; - - if (!bctl) - return 0; - - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - return target; -} - -/* - * @flags: available profiles in extended format (see ctree.h) - * - * Returns reduced profile in chunk format. If profile changing is in - * progress (either running or paused) picks the target profile (if it's - * already available), otherwise falls back to plain reducing. - */ -static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices = fs_info->fs_devices->rw_devices; - u64 target; - u64 raid_type; - u64 allowed = 0; - - /* - * see if restripe for this chunk_type is in progress, if so - * try to reduce to the target profile - */ - spin_lock(&fs_info->balance_lock); - target = get_restripe_target(fs_info, flags); - if (target) { - /* pick target profile only if it's already available */ - if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { - spin_unlock(&fs_info->balance_lock); - return extended_to_chunk(target); - } - } - spin_unlock(&fs_info->balance_lock); - - /* First, mask out the RAID levels which aren't possible */ - for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { - if (num_devices >= btrfs_raid_array[raid_type].devs_min) - allowed |= btrfs_raid_array[raid_type].bg_flag; - } - allowed &= flags; - - if (allowed & BTRFS_BLOCK_GROUP_RAID6) - allowed = BTRFS_BLOCK_GROUP_RAID6; - else if (allowed & BTRFS_BLOCK_GROUP_RAID5) - allowed = BTRFS_BLOCK_GROUP_RAID5; - else if (allowed & BTRFS_BLOCK_GROUP_RAID10) - allowed = BTRFS_BLOCK_GROUP_RAID10; - else if (allowed & BTRFS_BLOCK_GROUP_RAID1) - allowed = BTRFS_BLOCK_GROUP_RAID1; - else if (allowed & BTRFS_BLOCK_GROUP_RAID0) - allowed = BTRFS_BLOCK_GROUP_RAID0; - - flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; - - return extended_to_chunk(flags | allowed); -} - -static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) -{ - unsigned seq; - u64 flags; - - do { - flags = orig_flags; - seq = read_seqbegin(&fs_info->profiles_lock); - - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&fs_info->profiles_lock, seq)); - - return btrfs_reduce_alloc_profile(fs_info, flags); -} - static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3850,368 +2540,7 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - ret = get_alloc_profile(fs_info, flags); - return ret; -} - -u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) -{ - return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); -} - -u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) -{ - return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); -} - -u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) -{ - return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); -} - -static void force_metadata_allocation(struct btrfs_fs_info *info) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = CHUNK_ALLOC_FORCE; - } - rcu_read_unlock(); -} - -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) -{ - u64 bytes_used = btrfs_space_info_used(sinfo, false); - u64 thresh; - - if (force == CHUNK_ALLOC_FORCE) - return 1; - - /* - * in limited mode, we want to have some free space up to - * about 1% of the FS size. - */ - if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); - - if (sinfo->total_bytes - bytes_used < thresh) - return 1; - } - - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) - return 0; - return 1; -} - -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) -{ - u64 num_dev; - - num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; - if (!num_dev) - num_dev = fs_info->fs_devices->rw_devices; - - return num_dev; -} - -/* - * If @is_allocation is true, reserve space in the system space info necessary - * for allocating a chunk, otherwise if it's false, reserve space necessary for - * removing a chunk. - */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *info; - u64 left; - u64 thresh; - int ret = 0; - u64 num_devs; - - /* - * Needed because we can end up allocating a system chunk and for an - * atomic and race free space reservation in the chunk block reserve. - */ - lockdep_assert_held(&fs_info->chunk_mutex); - - info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - spin_lock(&info->lock); - left = info->total_bytes - btrfs_space_info_used(info, true); - spin_unlock(&info->lock); - - num_devs = get_profile_num_devs(fs_info, type); - - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + - btrfs_calc_trans_metadata_size(fs_info, 1); - - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); - btrfs_dump_space_info(fs_info, info, 0, 0); - } - - if (left < thresh) { - u64 flags = btrfs_system_alloc_profile(fs_info); - - /* - * Ignore failure to create system chunk. We might end up not - * needing it, as we might not need to COW all nodes/leafs from - * the paths we visit in the chunk tree (they were already COWed - * or created in the current transaction for example). - */ - ret = btrfs_alloc_chunk(trans, flags); - } - - if (!ret) { - ret = btrfs_block_rsv_add(fs_info->chunk_root, - &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); - if (!ret) - trans->chunk_bytes_reserved += thresh; - } -} - -/* - * If force is CHUNK_ALLOC_FORCE: - * - return 1 if it successfully allocates a chunk, - * - return errors including -ENOSPC otherwise. - * If force is NOT CHUNK_ALLOC_FORCE: - * - return 0 if it doesn't need to allocate a new chunk, - * - return 1 if it successfully allocates a chunk, - * - return errors including -ENOSPC otherwise. - */ -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - enum btrfs_chunk_alloc_enum force) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; - bool wait_for_alloc = false; - bool should_alloc = false; - int ret = 0; - - /* Don't re-enter if we're already allocating a chunk */ - if (trans->allocating_chunk) - return -ENOSPC; - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - - do { - spin_lock(&space_info->lock); - if (force < space_info->force_alloc) - force = space_info->force_alloc; - should_alloc = should_alloc_chunk(fs_info, space_info, force); - if (space_info->full) { - /* No more free physical space */ - if (should_alloc) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&space_info->lock); - return ret; - } else if (!should_alloc) { - spin_unlock(&space_info->lock); - return 0; - } else if (space_info->chunk_alloc) { - /* - * Someone is already allocating, so we need to block - * until this someone is finished and then loop to - * recheck if we should continue with our allocation - * attempt. - */ - wait_for_alloc = true; - spin_unlock(&space_info->lock); - mutex_lock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->chunk_mutex); - } else { - /* Proceed with allocation */ - space_info->chunk_alloc = 1; - wait_for_alloc = false; - spin_unlock(&space_info->lock); - } - - cond_resched(); - } while (wait_for_alloc); - - mutex_lock(&fs_info->chunk_mutex); - trans->allocating_chunk = true; - - /* - * If we have mixed data/metadata chunks we want to make sure we keep - * allocating mixed chunks instead of individual chunks. - */ - if (btrfs_mixed_space_info(space_info)) - flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); - - /* - * if we're doing a data chunk, go ahead and make sure that - * we keep a reasonable number of metadata chunks allocated in the - * FS as well. - */ - if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { - fs_info->data_chunk_allocations++; - if (!(fs_info->data_chunk_allocations % - fs_info->metadata_ratio)) - force_metadata_allocation(fs_info); - } - - /* - * Check if we have enough space in SYSTEM chunk because we may need - * to update devices. - */ - check_system_chunk(trans, flags); - - ret = btrfs_alloc_chunk(trans, flags); - trans->allocating_chunk = false; - - spin_lock(&space_info->lock); - if (ret < 0) { - if (ret == -ENOSPC) - space_info->full = 1; - else - goto out; - } else { - ret = 1; - space_info->max_extent_size = 0; - } - - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; -out: - space_info->chunk_alloc = 0; - spin_unlock(&space_info->lock); - mutex_unlock(&fs_info->chunk_mutex); - /* - * When we allocate a new chunk we reserve space in the chunk block - * reserve to make sure we can COW nodes/leafs in the chunk tree or - * add new nodes/leafs to it if we end up needing to do it when - * inserting the chunk item and updating device items as part of the - * second phase of chunk allocation, performed by - * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a - * large number of new block groups to create in our transaction - * handle's new_bgs list to avoid exhausting the chunk block reserve - * in extreme cases - like having a single transaction create many new - * block groups when starting to write out the free space caches of all - * the block groups that were made dirty during the lifetime of the - * transaction. - */ - if (trans->chunk_bytes_reserved >= (u64)SZ_2M) - btrfs_create_pending_block_groups(trans); - - return ret; -} - -static int update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc) -{ - struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group_cache *cache = NULL; - u64 total = num_bytes; - u64 old_val; - u64 byte_in_group; - int factor; - int ret = 0; - - /* block accounting for super block */ - spin_lock(&info->delalloc_root_lock); - old_val = btrfs_super_bytes_used(info->super_copy); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_root_lock); - - while (total) { - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; - } - factor = btrfs_bg_type_to_factor(cache->flags); - - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, 1); - - byte_in_group = bytenr - cache->key.objectid; - WARN_ON(byte_in_group > cache->key.offset); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; - - old_val = btrfs_block_group_used(&cache->item); - num_bytes = min(total, cache->key.offset - byte_in_group); - if (alloc) { - old_val += num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; - cache->space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - } else { - old_val -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, - cache->space_info, num_bytes); - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - trace_btrfs_space_reservation(info, "pinned", - cache->space_info->flags, - num_bytes, 1); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); - } - - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - - /* - * No longer have used bytes in this block group, queue it for - * deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) - btrfs_mark_bg_unused(cache); - - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; - } - - /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); + ret = btrfs_get_alloc_profile(fs_info, flags); return ret; } @@ -4254,8 +2583,6 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - trace_btrfs_space_reservation(fs_info, "pinned", - cache->space_info->flags, num_bytes, 1); percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); set_extent_dirty(fs_info->pinned_extents, bytenr, @@ -4299,7 +2626,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, 1); + btrfs_cache_block_group(cache, 1); pin_down_extent(cache, bytenr, num_bytes, 0); @@ -4320,18 +2647,19 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, if (!block_group) return -EINVAL; - cache_block_group(block_group, 0); - caching_ctl = get_caching_control(block_group); + btrfs_cache_block_group(block_group, 0); + caching_ctl = btrfs_get_caching_control(block_group); if (!caching_ctl) { /* Logic error */ - BUG_ON(!block_group_cache_done(block_group)); + BUG_ON(!btrfs_block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); } else { mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { - ret = add_excluded_extent(fs_info, start, num_bytes); + ret = btrfs_add_excluded_extent(fs_info, start, + num_bytes); } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); @@ -4345,11 +2673,12 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, num_bytes = (start + num_bytes) - caching_ctl->progress; start = caching_ctl->progress; - ret = add_excluded_extent(fs_info, start, num_bytes); + ret = btrfs_add_excluded_extent(fs_info, start, + num_bytes); } out_lock: mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); } btrfs_put_block_group(block_group); return ret; @@ -4393,108 +2722,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) atomic_inc(&bg->reservations); } -void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, - const u64 start) -{ - struct btrfs_block_group_cache *bg; - - bg = btrfs_lookup_block_group(fs_info, start); - ASSERT(bg); - if (atomic_dec_and_test(&bg->reservations)) - wake_up_var(&bg->reservations); - btrfs_put_block_group(bg); -} - -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) -{ - struct btrfs_space_info *space_info = bg->space_info; - - ASSERT(bg->ro); - - if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) - return; - - /* - * Our block group is read only but before we set it to read only, - * some task might have had allocated an extent from it already, but it - * has not yet created a respective ordered extent (and added it to a - * root's list of ordered extents). - * Therefore wait for any task currently allocating extents, since the - * block group's reservations counter is incremented while a read lock - * on the groups' semaphore is held and decremented after releasing - * the read access on that semaphore and creating the ordered extent. - */ - down_write(&space_info->groups_sem); - up_write(&space_info->groups_sem); - - wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); -} - -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @ram_bytes: The number of bytes of file content, and will be same to - * @num_bytes except for the compress path. - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write - * - * This is called by the allocator when it reserves space. If this is a - * reservation and the block group has become read only we cannot make the - * reservation and return -EAGAIN, otherwise this function always succeeds. - */ -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) -{ - struct btrfs_space_info *space_info = cache->space_info; - int ret = 0; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - return ret; -} - -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write - * - * This is called by somebody who is freeing space that was never actually used - * on disk. For example if you reserve some space for a new leaf in transaction - * A and before transaction A commits you free that leaf, you call this with - * reserve set to 0 in order to clear the reservation. - */ - -static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc) -{ - struct btrfs_space_info *space_info = cache->space_info; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->max_extent_size = 0; - - if (delalloc) - cache->delalloc_bytes -= num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); -} void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { struct btrfs_caching_control *next; @@ -4506,10 +2733,10 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { cache = caching_ctl->block_group; - if (block_group_cache_done(cache)) { + if (btrfs_block_group_cache_done(cache)) { cache->last_byte_to_unpin = (u64)-1; list_del_init(&caching_ctl->list); - put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); } else { cache->last_byte_to_unpin = caching_ctl->progress; } @@ -4613,9 +2840,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&cache->lock); cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); - - trace_btrfs_space_reservation(fs_info, "pinned", - space_info->flags, len, 0); space_info->max_extent_size = 0; percpu_counter_add_batch(&space_info->total_bytes_pinned, -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); @@ -4637,17 +2861,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; - trace_btrfs_space_reservation(fs_info, - "space_info", - space_info->flags, - to_add, 1); len -= to_add; } spin_unlock(&global_rsv->lock); /* Add to any tickets we may have */ if (len) - btrfs_space_info_add_new_bytes(fs_info, - space_info, len); + btrfs_try_granting_tickets(fs_info, + space_info); } spin_unlock(&space_info->lock); } @@ -4948,7 +3168,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = update_block_group(trans, bytenr, num_bytes, 0); + ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -5121,53 +3341,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) return ret; } -/* - * when we wait for progress in the block group caching, its because - * our allocation attempt failed at least once. So, we must sleep - * and let some progress happen before we try again. - * - * This function will sleep at least once waiting for new free space to - * show up, and then it will check the block group free space numbers - * for our min num_bytes. Another option is to have it go ahead - * and look in the rbtree for a free extent of a given size, but this - * is a good start. - * - * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using - * any of the information in this block group. - */ -static noinline void -wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, - u64 num_bytes) -{ - struct btrfs_caching_control *caching_ctl; - - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return; - - wait_event(caching_ctl->wait, block_group_cache_done(cache) || - (cache->free_space_ctl->free_space >= num_bytes)); - - put_caching_control(caching_ctl); -} - -static noinline int -wait_block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *caching_ctl; - int ret = 0; - - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - - wait_event(caching_ctl->wait, block_group_cache_done(cache)); - if (cache->cached == BTRFS_CACHE_ERROR) - ret = -EIO; - put_caching_control(caching_ctl); - return ret; -} - enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, @@ -5387,7 +3560,7 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); ffe_ctl->retry_clustered = true; - wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + ffe_ctl->empty_cluster + ffe_ctl->empty_size); return -EAGAIN; } @@ -5454,8 +3627,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, */ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); + btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); ffe_ctl->retry_unclustered = true; return -EAGAIN; } else if (!offset) { @@ -5751,13 +3924,21 @@ search: */ if ((flags & extra) && !(block_group->flags & extra)) goto loop; + + /* + * This block group has different flags than we want. + * It's possible that we have MIXED_GROUP flag but no + * block group is mixed. Just skip such block group. + */ + btrfs_release_block_group(block_group, delalloc); + continue; } have_block_group: - ffe_ctl.cached = block_group_cache_done(block_group); + ffe_ctl.cached = btrfs_block_group_cache_done(block_group); if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; - ret = cache_block_group(block_group, 0); + ret = btrfs_cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; } @@ -6052,7 +4233,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, ins->objectid, ins->offset, 1); + ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -6142,8 +4323,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, extent_key.objectid, - fs_info->nodesize, 1); + ret = btrfs_update_block_group(trans, extent_key.objectid, + fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", extent_key.objectid, extent_key.offset); @@ -7293,186 +5474,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices; - u64 stripped; - - /* - * if restripe for this chunk_type is on pick target profile and - * return, otherwise do the usual balance - */ - stripped = get_restripe_target(fs_info, flags); - if (stripped) - return extended_to_chunk(stripped); - - num_devices = fs_info->fs_devices->rw_devices; - - stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; - - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; - - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; - } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; - - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; - - /* this is drive concat, leave it alone */ - } - - return flags; -} - -static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - u64 sinfo_used; - u64 min_allocable_bytes; - int ret = -ENOSPC; - - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = SZ_1M; - else - min_allocable_bytes = 0; - - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - - if (cache->ro) { - cache->ro++; - ret = 0; - goto out; - } - - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo_used = btrfs_space_info_used(sinfo, true); - - if (sinfo_used + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { - sinfo->bytes_readonly += num_bytes; - cache->ro++; - list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; - } -out: - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); - if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { - btrfs_info(cache->fs_info, - "unable to make block group %llu ro", - cache->key.objectid); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", - sinfo_used, num_bytes, min_allocable_bytes); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); - } - return ret; -} - -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) - -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_trans_handle *trans; - u64 alloc_flags; - int ret; - -again: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * we're not allowed to set block groups readonly after the dirty - * block groups cache has started writing. If it already started, - * back off and let this transaction commit - */ - mutex_lock(&fs_info->ro_block_group_mutex); - if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { - u64 transid = trans->transid; - - mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans); - - ret = btrfs_wait_for_commit(fs_info, transid); - if (ret) - return ret; - goto again; - } - - /* - * if we are changing raid levels, try to allocate a corresponding - * block group with the new raid level. - */ - alloc_flags = update_block_group_flags(fs_info, cache->flags); - if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); - /* - * ENOSPC is allowed here, we may have enough space - * already allocated at the new raid level to - * carry on - */ - if (ret == -ENOSPC) - ret = 0; - if (ret < 0) - goto out; - } - - ret = inc_block_group_ro(cache, 0); - if (!ret) - goto out; - alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - ret = inc_block_group_ro(cache, 0); -out: - if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = update_block_group_flags(fs_info, cache->flags); - mutex_lock(&fs_info->chunk_mutex); - check_system_chunk(trans, alloc_flags); - mutex_unlock(&fs_info->chunk_mutex); - } - mutex_unlock(&fs_info->ro_block_group_mutex); - - btrfs_end_transaction(trans); - return ret; -} - -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) -{ - u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - - return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); -} - /* * helper to account the unused space of all the readonly block group in the * space_info. takes mirrors into account. @@ -7508,1346 +5509,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - - BUG_ON(!cache->ro); - - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - if (!--cache->ro) { - num_bytes = cache->key.offset - cache->reserved - - cache->pinned - cache->bytes_super - - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - list_del_init(&cache->ro_list); - } - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); -} - -/* - * Checks to see if it's even possible to relocate this block group. - * - * @return - -1 if it's not a good idea to relocate this block group, 0 if its - * ok to go ahead and try. - */ -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *device; - u64 min_free; - u64 dev_min = 1; - u64 dev_nr = 0; - u64 target; - int debug; - int index; - int full = 0; - int ret = 0; - - debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - - /* odd, couldn't find the block group, leave it alone */ - if (!block_group) { - if (debug) - btrfs_warn(fs_info, - "can't find block group for bytenr %llu", - bytenr); - return -1; - } - - min_free = btrfs_block_group_used(&block_group->item); - - /* no bytes used, we're good */ - if (!min_free) - goto out; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - - full = space_info->full; - - /* - * if this is the last block group we have in this space, we can't - * relocate it unless we're able to allocate a new chunk below. - * - * Otherwise, we need to make sure we have room in the space to handle - * all of the extents from this block group. If we can, we're good - */ - if ((space_info->total_bytes != block_group->key.offset) && - (btrfs_space_info_used(space_info, false) + min_free < - space_info->total_bytes)) { - spin_unlock(&space_info->lock); - goto out; - } - spin_unlock(&space_info->lock); - - /* - * ok we don't have enough space, but maybe we have free space on our - * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. if this block - * group is going to be restriped, run checks against the target - * profile instead of the current one. - */ - ret = -1; - - /* - * index: - * 0: raid10 - * 1: raid1 - * 2: dup - * 3: raid0 - * 4: single - */ - target = get_restripe_target(fs_info, block_group->flags); - if (target) { - index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); - } else { - /* - * this is just a balance, so if we were marked as full - * we know there is no space for a new chunk - */ - if (full) { - if (debug) - btrfs_warn(fs_info, - "no space to alloc new chunk for block group %llu", - block_group->key.objectid); - goto out; - } - - index = btrfs_bg_flags_to_raid_index(block_group->flags); - } - - if (index == BTRFS_RAID_RAID10) { - dev_min = 4; - /* Divide by 2 */ - min_free >>= 1; - } else if (index == BTRFS_RAID_RAID1) { - dev_min = 2; - } else if (index == BTRFS_RAID_DUP) { - /* Multiply by 2 */ - min_free <<= 1; - } else if (index == BTRFS_RAID_RAID0) { - dev_min = fs_devices->rw_devices; - min_free = div64_u64(min_free, dev_min); - } - - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 dev_offset; - - /* - * check to make sure we can actually find a chunk with enough - * space to fit our block group in. - */ - if (device->total_bytes > device->bytes_used + min_free && - !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = find_free_dev_extent(device, min_free, - &dev_offset, NULL); - if (!ret) - dev_nr++; - - if (dev_nr >= dev_min) - break; - - ret = -1; - } - } - if (debug && ret == -1) - btrfs_warn(fs_info, - "no space to allocate a new chunk for block group %llu", - block_group->key.objectid); - mutex_unlock(&fs_info->chunk_mutex); -out: - btrfs_put_block_group(block_group); - return ret; -} - -static int find_first_block_group(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - struct btrfs_key *key) -{ - struct btrfs_root *root = fs_info->extent_root; - int ret = 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_block_group_item bg; - u64 flags; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - if (found_key.objectid >= key->objectid && - found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - em_tree = &root->fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, found_key.objectid, - found_key.offset); - read_unlock(&em_tree->lock); - if (!em) { - btrfs_err(fs_info, - "logical %llu len %llu found bg but no related chunk", - found_key.objectid, found_key.offset); - ret = -ENOENT; - } else if (em->start != found_key.objectid || - em->len != found_key.offset) { - btrfs_err(fs_info, - "block group %llu len %llu mismatch with chunk %llu len %llu", - found_key.objectid, found_key.offset, - em->start, em->len); - ret = -EUCLEAN; - } else { - read_extent_buffer(leaf, &bg, - btrfs_item_ptr_offset(leaf, slot), - sizeof(bg)); - flags = btrfs_block_group_flags(&bg) & - BTRFS_BLOCK_GROUP_TYPE_MASK; - - if (flags != (em->map_lookup->type & - BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", - found_key.objectid, - found_key.offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & - em->map_lookup->type)); - ret = -EUCLEAN; - } else { - ret = 0; - } - } - free_extent_map(em); - goto out; - } - path->slots[0]++; - } -out: - return ret; -} - -void btrfs_put_block_group_cache(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - u64 last = 0; - - while (1) { - struct inode *inode; - - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - wait_block_group_cache_done(block_group); - spin_lock(&block_group->lock); - if (block_group->iref) - break; - spin_unlock(&block_group->lock); - block_group = next_block_group(block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } - - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); - last = block_group->key.objectid + block_group->key.offset; - btrfs_put_block_group(block_group); - } -} - -/* - * Must be called only after stopping all workers, since we could have block - * group caching kthreads running, and therefore they could race with us if we - * freed the block groups before stopping them. - */ -int btrfs_free_block_groups(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_caching_control *caching_ctl; - struct rb_node *n; - - down_write(&info->commit_root_sem); - while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); - list_del(&caching_ctl->list); - put_caching_control(caching_ctl); - } - up_write(&info->commit_root_sem); - - spin_lock(&info->unused_bgs_lock); - while (!list_empty(&info->unused_bgs)) { - block_group = list_first_entry(&info->unused_bgs, - struct btrfs_block_group_cache, - bg_list); - list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); - } - spin_unlock(&info->unused_bgs_lock); - - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { - block_group = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); - up_write(&block_group->space_info->groups_sem); - - /* - * We haven't cached this block group, which means we could - * possibly have excluded extents on this block group. - */ - if (block_group->cached == BTRFS_CACHE_NO || - block_group->cached == BTRFS_CACHE_ERROR) - free_excluded_extents(block_group); - - btrfs_remove_free_space_cache(block_group); - ASSERT(block_group->cached != BTRFS_CACHE_STARTED); - ASSERT(list_empty(&block_group->dirty_list)); - ASSERT(list_empty(&block_group->io_list)); - ASSERT(list_empty(&block_group->bg_list)); - ASSERT(atomic_read(&block_group->count) == 1); - btrfs_put_block_group(block_group); - - spin_lock(&info->block_group_cache_lock); - } - spin_unlock(&info->block_group_cache_lock); - - /* now that all the block groups are freed, go through and - * free all the space_info structs. This is only called during - * the final stages of unmount, and so we know nobody is - * using them. We call synchronize_rcu() once before we start, - * just to be on the safe side. - */ - synchronize_rcu(); - - btrfs_release_global_block_rsv(info); - - while (!list_empty(&info->space_info)) { - int i; - - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - - /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. - */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - list_del(&space_info->list); - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - struct kobject *kobj; - kobj = space_info->block_group_kobjs[i]; - space_info->block_group_kobjs[i] = NULL; - if (kobj) { - kobject_del(kobj); - kobject_put(kobj); - } - } - kobject_del(&space_info->kobj); - kobject_put(&space_info->kobj); - } - return 0; -} - -static void link_block_group(struct btrfs_block_group_cache *cache) -{ - struct btrfs_space_info *space_info = cache->space_info; - struct btrfs_fs_info *fs_info = cache->fs_info; - int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; - - down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); - - if (first) { - struct raid_kobject *rkobj; - unsigned int nofs_flag; - int ret; - - /* - * Setup a NOFS context because kobject_add(), deep in its call - * chain, does GFP_KERNEL allocations, and we are often called - * in a context where if reclaim is triggered we can deadlock - * (we are either holding a transaction handle or some lock - * required for a transaction commit). - */ - nofs_flag = memalloc_nofs_save(); - rkobj = kzalloc(sizeof(*rkobj), GFP_KERNEL); - if (!rkobj) { - memalloc_nofs_restore(nofs_flag); - btrfs_warn(cache->fs_info, - "couldn't alloc memory for raid level kobject"); - return; - } - rkobj->flags = cache->flags; - kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", - btrfs_bg_type_to_raid_name(rkobj->flags)); - memalloc_nofs_restore(nofs_flag); - if (ret) { - kobject_put(&rkobj->kobj); - btrfs_warn(fs_info, - "failed to add kobject for block cache, ignoring"); - return; - } - space_info->block_group_kobjs[index] = &rkobj->kobj; - } -} - -static struct btrfs_block_group_cache * -btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, - u64 start, u64 size) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; - } - - cache->key.objectid = start; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - - cache->fs_info = fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); - set_free_space_tree_thresholds(cache); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - init_rwsem(&cache->data_rwsem); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - INIT_LIST_HEAD(&cache->ro_list); - INIT_LIST_HEAD(&cache->dirty_list); - INIT_LIST_HEAD(&cache->io_list); - btrfs_init_free_space_ctl(cache); - atomic_set(&cache->trimming, 0); - mutex_init(&cache->free_space_lock); - btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); - - return cache; -} - - -/* - * Iterate all chunks and verify that each of them has the corresponding block - * group - */ -static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) -{ - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct btrfs_block_group_cache *bg; - u64 start = 0; - int ret = 0; - - while (1) { - read_lock(&map_tree->lock); - /* - * lookup_extent_mapping will return the first extent map - * intersecting the range, so setting @len to 1 is enough to - * get the first chunk. - */ - em = lookup_extent_mapping(map_tree, start, 1); - read_unlock(&map_tree->lock); - if (!em) - break; - - bg = btrfs_lookup_block_group(fs_info, em->start); - if (!bg) { - btrfs_err(fs_info, - "chunk start=%llu len=%llu doesn't have corresponding block group", - em->start, em->len); - ret = -EUCLEAN; - free_extent_map(em); - break; - } - if (bg->key.objectid != em->start || - bg->key.offset != em->len || - (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", - em->start, em->len, - em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, - bg->key.objectid, bg->key.offset, - bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); - ret = -EUCLEAN; - free_extent_map(em); - btrfs_put_block_group(bg); - break; - } - start = em->start + em->len; - free_extent_map(em); - btrfs_put_block_group(bg); - } - return ret; -} - -int btrfs_read_block_groups(struct btrfs_fs_info *info) -{ - struct btrfs_path *path; - int ret; - struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *leaf; - int need_clear = 0; - u64 cache_gen; - u64 feature; - int mixed; - - feature = btrfs_super_incompat_flags(info->super_copy); - mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); - - key.objectid = 0; - key.offset = 0; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - - cache_gen = btrfs_super_cache_generation(info->super_copy); - if (btrfs_test_opt(info, SPACE_CACHE) && - btrfs_super_generation(info->super_copy) != cache_gen) - need_clear = 1; - if (btrfs_test_opt(info, CLEAR_CACHE)) - need_clear = 1; - - while (1) { - ret = find_first_block_group(info, path, &key); - if (ret > 0) - break; - if (ret != 0) - goto error; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - cache = btrfs_create_block_group_cache(info, found_key.objectid, - found_key.offset); - if (!cache) { - ret = -ENOMEM; - goto error; - } - - if (need_clear) { - /* - * When we mount with old space cache, we need to - * set BTRFS_DC_CLEAR and set dirty flag. - * - * a) Setting 'BTRFS_DC_CLEAR' makes sure that we - * truncate the old free space cache inode and - * setup a new one. - * b) Setting 'dirty flag' makes sure that we flush - * the new space cache info onto disk. - */ - if (btrfs_test_opt(info, SPACE_CACHE)) - cache->disk_cache_state = BTRFS_DC_CLEAR; - } - - read_extent_buffer(leaf, &cache->item, - btrfs_item_ptr_offset(leaf, path->slots[0]), - sizeof(cache->item)); - cache->flags = btrfs_block_group_flags(&cache->item); - if (!mixed && - ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && - (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(info, -"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", - cache->key.objectid); - ret = -EINVAL; - goto error; - } - - key.objectid = found_key.objectid + found_key.offset; - btrfs_release_path(path); - - /* - * We need to exclude the super stripes now so that the space - * info has super bytes accounted for, otherwise we'll think - * we have more space than we actually do. - */ - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. - */ - free_excluded_extents(cache); - btrfs_put_block_group(cache); - goto error; - } - - /* - * check for two cases, either we are full, and therefore - * don't need to bother with the caching work since we won't - * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _a_lot_ of - * time, particularly in the full case. - */ - if (found_key.offset == btrfs_block_group_used(&cache->item)) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - free_excluded_extents(cache); - } else if (btrfs_block_group_used(&cache->item) == 0) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, found_key.objectid, - found_key.objectid + - found_key.offset); - free_excluded_extents(cache); - } - - ret = btrfs_add_block_group_cache(info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - goto error; - } - - trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - - cache->space_info = space_info; - - link_block_group(cache); - - set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->key.objectid)) { - inc_block_group_ro(cache, 1); - } else if (btrfs_block_group_used(&cache->item) == 0) { - ASSERT(list_empty(&cache->bg_list)); - btrfs_mark_bg_unused(cache); - } - } - - list_for_each_entry_rcu(space_info, &info->space_info, list) { - if (!(get_alloc_profile(info, space_info->flags) & - (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_DUP))) - continue; - /* - * avoid allocating from un-mirrored block group if there are - * mirrored block groups. - */ - list_for_each_entry(cache, - &space_info->block_groups[BTRFS_RAID_RAID0], - list) - inc_block_group_ro(cache, 1); - list_for_each_entry(cache, - &space_info->block_groups[BTRFS_RAID_SINGLE], - list) - inc_block_group_ro(cache, 1); - } - - btrfs_init_global_block_rsv(info); - ret = check_chunk_block_group_mappings(info); -error: - btrfs_free_path(path); - return ret; -} - -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_block_group_item item; - struct btrfs_key key; - int ret = 0; - - if (!trans->can_flush_pending_bgs) - return; - - while (!list_empty(&trans->new_bgs)) { - block_group = list_first_entry(&trans->new_bgs, - struct btrfs_block_group_cache, - bg_list); - if (ret) - goto next; - - spin_lock(&block_group->lock); - memcpy(&item, &block_group->item, sizeof(item)); - memcpy(&key, &block_group->key, sizeof(key)); - spin_unlock(&block_group->lock); - - ret = btrfs_insert_item(trans, extent_root, &key, &item, - sizeof(item)); - if (ret) - btrfs_abort_transaction(trans, ret); - ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); - if (ret) - btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, block_group); - /* already aborted the transaction if it failed. */ -next: - btrfs_delayed_refs_rsv_release(fs_info, 1); - list_del_init(&block_group->bg_list); - } - btrfs_trans_release_chunk_metadata(trans); -} - -int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, - u64 type, u64 chunk_offset, u64 size) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; - int ret; - - btrfs_set_log_full_commit(trans); - - cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); - if (!cache) - return -ENOMEM; - - btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_objectid(&cache->item, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_block_group_flags(&cache->item, type); - - cache->flags = type; - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - cache->needs_free_space = 1; - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. - */ - free_excluded_extents(cache); - btrfs_put_block_group(cache); - return ret; - } - - add_new_free_space(cache, chunk_offset, chunk_offset + size); - - free_excluded_extents(cache); - -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - bytes_used += new_bytes_used >> 1; - fragment_free_space(cache); - } -#endif - /* - * Ensure the corresponding space_info object is created and - * assigned to our block group. We want our bg to be added to the rbtree - * with its ->space_info set. - */ - cache->space_info = btrfs_find_space_info(fs_info, cache->flags); - ASSERT(cache->space_info); - - ret = btrfs_add_block_group_cache(fs_info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; - } - - /* - * Now that our block group has its ->space_info set and is inserted in - * the rbtree, update the space info's counters. - */ - trace_btrfs_add_block_group(fs_info, cache, 1); - btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); - btrfs_update_global_block_rsv(fs_info); - - link_block_group(cache); - - list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); - - set_avail_alloc_bits(fs_info, type); - return 0; -} - -static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - write_seqlock(&fs_info->profiles_lock); - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits &= ~extra_flags; - write_sequnlock(&fs_info->profiles_lock); -} - -/* - * Clear incompat bits for the following feature(s): - * - * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group - * in the whole filesystem - */ -static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { - struct list_head *head = &fs_info->space_info; - struct btrfs_space_info *sinfo; - - list_for_each_entry_rcu(sinfo, head, list) { - bool found = false; - - down_read(&sinfo->groups_sem); - if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) - found = true; - if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) - found = true; - up_read(&sinfo->groups_sem); - - if (found) - return; - } - btrfs_clear_fs_incompat(fs_info, RAID56); - } -} - -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_path *path; - struct btrfs_block_group_cache *block_group; - struct btrfs_free_cluster *cluster; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_key key; - struct inode *inode; - struct kobject *kobj = NULL; - int ret; - int index; - int factor; - struct btrfs_caching_control *caching_ctl = NULL; - bool remove_em; - bool remove_rsv = false; - - block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!block_group); - BUG_ON(!block_group->ro); - - trace_btrfs_remove_block_group(block_group); - /* - * Free the reserved super bytes from this block group before - * remove it. - */ - free_excluded_extents(block_group); - btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, - block_group->key.offset); - - memcpy(&key, &block_group->key, sizeof(key)); - index = btrfs_bg_flags_to_raid_index(block_group->flags); - factor = btrfs_bg_type_to_factor(block_group->flags); - - /* make sure this block group isn't part of an allocation cluster */ - cluster = &fs_info->data_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); - - /* - * make sure this block group isn't part of a metadata - * allocation cluster - */ - cluster = &fs_info->meta_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - /* - * get the inode first so any iput calls done for the io_list - * aren't the final iput (no unlinks allowed now) - */ - inode = lookup_free_space_inode(block_group, path); - - mutex_lock(&trans->transaction->cache_write_mutex); - /* - * Make sure our free space cache IO is done before removing the - * free space inode - */ - spin_lock(&trans->transaction->dirty_bgs_lock); - if (!list_empty(&block_group->io_list)) { - list_del_init(&block_group->io_list); - - WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); - - spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_wait_cache_io(trans, block_group, path); - btrfs_put_block_group(block_group); - spin_lock(&trans->transaction->dirty_bgs_lock); - } - - if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - remove_rsv = true; - btrfs_put_block_group(block_group); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - mutex_unlock(&trans->transaction->cache_write_mutex); - - if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { - btrfs_add_delayed_iput(inode); - goto out; - } - clear_nlink(inode); - /* One for the block groups ref */ - spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - iput(inode); - } else { - spin_unlock(&block_group->lock); - } - /* One for our lookup ref */ - btrfs_add_delayed_iput(inode); - } - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = block_group->key.objectid; - key.type = 0; - - ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) - btrfs_release_path(path); - if (ret == 0) { - ret = btrfs_del_item(trans, tree_root, path); - if (ret) - goto out; - btrfs_release_path(path); - } - - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&block_group->cache_node); - - if (fs_info->first_logical_byte == block_group->key.objectid) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - /* - * we must use list_del_init so people can check to see if they - * are still on the list after taking the semaphore - */ - list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) { - kobj = block_group->space_info->block_group_kobjs[index]; - block_group->space_info->block_group_kobjs[index] = NULL; - clear_avail_alloc_bits(fs_info, block_group->flags); - } - up_write(&block_group->space_info->groups_sem); - clear_incompat_bg_bits(fs_info, block_group->flags); - if (kobj) { - kobject_del(kobj); - kobject_put(kobj); - } - - if (block_group->has_caching_ctl) - caching_ctl = get_caching_control(block_group); - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - if (block_group->has_caching_ctl) { - down_write(&fs_info->commit_root_sem); - if (!caching_ctl) { - struct btrfs_caching_control *ctl; - - list_for_each_entry(ctl, - &fs_info->caching_block_groups, list) - if (ctl->block_group == block_group) { - caching_ctl = ctl; - refcount_inc(&caching_ctl->count); - break; - } - } - if (caching_ctl) - list_del_init(&caching_ctl->list); - up_write(&fs_info->commit_root_sem); - if (caching_ctl) { - /* Once for the caching bgs list and once for us. */ - put_caching_control(caching_ctl); - put_caching_control(caching_ctl); - } - } - - spin_lock(&trans->transaction->dirty_bgs_lock); - WARN_ON(!list_empty(&block_group->dirty_list)); - WARN_ON(!list_empty(&block_group->io_list)); - spin_unlock(&trans->transaction->dirty_bgs_lock); - - btrfs_remove_free_space_cache(block_group); - - spin_lock(&block_group->space_info->lock); - list_del_init(&block_group->ro_list); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - WARN_ON(block_group->space_info->total_bytes - < block_group->key.offset); - WARN_ON(block_group->space_info->bytes_readonly - < block_group->key.offset); - WARN_ON(block_group->space_info->disk_total - < block_group->key.offset * factor); - } - block_group->space_info->total_bytes -= block_group->key.offset; - block_group->space_info->bytes_readonly -= block_group->key.offset; - block_group->space_info->disk_total -= block_group->key.offset * factor; - - spin_unlock(&block_group->space_info->lock); - - memcpy(&key, &block_group->key, sizeof(key)); - - mutex_lock(&fs_info->chunk_mutex); - spin_lock(&block_group->lock); - block_group->removed = 1; - /* - * At this point trimming can't start on this block group, because we - * removed the block group from the tree fs_info->block_group_cache_tree - * so no one can't find it anymore and even if someone already got this - * block group before we removed it from the rbtree, they have already - * incremented block_group->trimming - if they didn't, they won't find - * any free space entries because we already removed them all when we - * called btrfs_remove_free_space_cache(). - * - * And we must not remove the extent map from the fs_info->mapping_tree - * to prevent the same logical address range and physical device space - * ranges from being reused for a new block group. This is because our - * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is - * completely transactionless, so while it is trimming a range the - * currently running transaction might finish and a new one start, - * allowing for new block groups to be created that can reuse the same - * physical device locations unless we take this special care. - * - * There may also be an implicit trim operation if the file system - * is mounted with -odiscard. The same protections must remain - * in place until the extents have been discarded completely when - * the transaction commit has completed. - */ - remove_em = (atomic_read(&block_group->trimming) == 0); - spin_unlock(&block_group->lock); - - mutex_unlock(&fs_info->chunk_mutex); - - ret = remove_block_group_free_space(trans, block_group); - if (ret) - goto out; - - btrfs_put_block_group(block_group); - btrfs_put_block_group(block_group); - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -EIO; - if (ret < 0) - goto out; - - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; - - if (remove_em) { - struct extent_map_tree *em_tree; - - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); - } -out: - if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); - btrfs_free_path(path); - return ret; -} - -struct btrfs_trans_handle * -btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, - const u64 chunk_offset) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - unsigned int num_items; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - ASSERT(em && em->start == chunk_offset); - - /* - * We need to reserve 3 + N units from the metadata space info in order - * to remove a block group (done at btrfs_remove_chunk() and at - * btrfs_remove_block_group()), which are used for: - * - * 1 unit for adding the free space inode's orphan (located in the tree - * of tree roots). - * 1 unit for deleting the block group item (located in the extent - * tree). - * 1 unit for deleting the free space item (located in tree of tree - * roots). - * N units for deleting N device extent items corresponding to each - * stripe (located in the device tree). - * - * In order to remove a block group we also need to reserve units in the - * system space info in order to update the chunk tree (update one or - * more device items and remove one chunk item), but this is done at - * btrfs_remove_chunk() through a call to check_system_chunk(). - */ - map = em->map_lookup; - num_items = 3 + map->num_stripes; - free_extent_map(em); - - return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items, 1); -} - -/* - * Process the unused_bgs list and remove any that don't have any allocated - * space inside of them. - */ -void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - int ret = 0; - - if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) - return; - - spin_lock(&fs_info->unused_bgs_lock); - while (!list_empty(&fs_info->unused_bgs)) { - u64 start, end; - int trimming; - - block_group = list_first_entry(&fs_info->unused_bgs, - struct btrfs_block_group_cache, - bg_list); - list_del_init(&block_group->bg_list); - - space_info = block_group->space_info; - - if (ret || btrfs_mixed_space_info(space_info)) { - btrfs_put_block_group(block_group); - continue; - } - spin_unlock(&fs_info->unused_bgs_lock); - - mutex_lock(&fs_info->delete_unused_bgs_mutex); - - /* Don't want to race with allocators so take the groups_sem */ - down_write(&space_info->groups_sem); - spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - btrfs_block_group_used(&block_group->item) || - block_group->ro || - list_is_singular(&block_group->list)) { - /* - * We want to bail if we made new allocations or have - * outstanding allocations in this block group. We do - * the ro check in case balance is currently acting on - * this block group. - */ - trace_btrfs_skip_unused_block_group(block_group); - spin_unlock(&block_group->lock); - up_write(&space_info->groups_sem); - goto next; - } - spin_unlock(&block_group->lock); - - /* We don't want to force the issue, only flip if it's ok. */ - ret = inc_block_group_ro(block_group, 0); - up_write(&space_info->groups_sem); - if (ret < 0) { - ret = 0; - goto next; - } - - /* - * Want to do this before we do anything else so we can recover - * properly if we fail to join the transaction. - */ - trans = btrfs_start_trans_remove_block_group(fs_info, - block_group->key.objectid); - if (IS_ERR(trans)) { - btrfs_dec_block_group_ro(block_group); - ret = PTR_ERR(trans); - goto next; - } - - /* - * We could have pending pinned extents for this block group, - * just delete them, we don't care about them anymore. - */ - start = block_group->key.objectid; - end = start + block_group->key.offset - 1; - /* - * Hold the unused_bg_unpin_mutex lock to avoid racing with - * btrfs_finish_extent_commit(). If we are at transaction N, - * another task might be running finish_extent_commit() for the - * previous transaction N - 1, and have seen a range belonging - * to the block group in freed_extents[] before we were able to - * clear the whole block group range from freed_extents[]. This - * means that task can lookup for the block group after we - * unpinned it from freed_extents[] and removed it, leading to - * a BUG_ON() at btrfs_unpin_extent_range(). - */ - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - - /* Reset pinned so btrfs_put_block_group doesn't complain */ - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); - space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - block_group->pinned = 0; - - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD); - - /* Implicit trim during transaction commit. */ - if (trimming) - btrfs_get_block_group_trimming(block_group); - - /* - * Btrfs_remove_chunk will abort the transaction if things go - * horribly wrong. - */ - ret = btrfs_remove_chunk(trans, block_group->key.objectid); - - if (ret) { - if (trimming) - btrfs_put_block_group_trimming(block_group); - goto end_trans; - } - - /* - * If we're not mounted with -odiscard, we can just forget - * about this block group. Otherwise we'll need to wait - * until transaction commit to do the actual discard. - */ - if (trimming) { - spin_lock(&fs_info->unused_bgs_lock); - /* - * A concurrent scrub might have added us to the list - * fs_info->unused_bgs, so use a list_move operation - * to add the block group to the deleted_bgs list. - */ - list_move(&block_group->bg_list, - &trans->transaction->deleted_bgs); - spin_unlock(&fs_info->unused_bgs_lock); - btrfs_get_block_group(block_group); - } -end_trans: - btrfs_end_transaction(trans); -next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - btrfs_put_block_group(block_group); - spin_lock(&fs_info->unused_bgs_lock); - } - spin_unlock(&fs_info->unused_bgs_lock); -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { @@ -8985,7 +5646,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return -EINVAL; cache = btrfs_lookup_first_block_group(fs_info, range->start); - for (; cache; cache = next_block_group(cache)) { + for (; cache; cache = btrfs_next_block_group(cache)) { if (cache->key.objectid >= range_end) { btrfs_put_block_group(cache); break; @@ -8995,14 +5656,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) end = min(range_end, cache->key.objectid + cache->key.offset); if (end - start >= range->minlen) { - if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, 0); + if (!btrfs_block_group_cache_done(cache)) { + ret = btrfs_cache_block_group(cache, 0); if (ret) { bg_failed++; bg_ret = ret; continue; } - ret = wait_block_group_cache_done(cache); + ret = btrfs_wait_block_group_cache_done(cache); if (ret) { bg_failed++; bg_ret = ret; @@ -9095,16 +5756,3 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) !atomic_read(&root->will_be_snapshotted)); } } - -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) -{ - struct btrfs_fs_info *fs_info = bg->fs_info; - - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - trace_btrfs_add_unused_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->unused_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); -} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ff438fd5bc2..7b32b6af322d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1938,9 +1938,9 @@ out: } void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) + struct page *locked_page, + unsigned clear_bits, + unsigned long page_ops) { clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, NULL); @@ -3628,6 +3628,13 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + /* * Lock eb pages and flush the bio if we can't the locks * @@ -3699,8 +3706,11 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) { + int err; + + err = flush_write_bio(epd); + if (err < 0) { + ret = err; failed_page_nr = i; goto err_unlock; } @@ -3715,16 +3725,23 @@ err_unlock: /* Unlock already locked pages */ for (i = 0; i < failed_page_nr; i++) unlock_page(eb->pages[i]); + /* + * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. + * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can + * be made and undo everything done before. + */ + btrfs_tree_lock(eb); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + end_extent_buffer_writeback(eb); + spin_unlock(&eb->refs_lock); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, + fs_info->dirty_metadata_batch); + btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_tree_unlock(eb); return ret; } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; @@ -4322,10 +4339,8 @@ int extent_invalidatepage(struct extent_io_tree *tree, lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state); + clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 401423b16976..cf3424d58fec 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -494,9 +494,9 @@ int map_private_extent_buffer(const struct extent_buffer *eb, void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned bits_to_clear, - unsigned long page_ops); + struct page *locked_page, + unsigned bits_to_clear, + unsigned long page_ops); struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9558d79faf1e..9d30acca55e1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -384,6 +384,8 @@ int add_extent_mapping(struct extent_map_tree *tree, { int ret = 0; + lockdep_assert_held_write(&tree->lock); + ret = tree_insert(&tree->map, em); if (ret) goto out; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58a18ed11546..8fe4eb7e5045 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -537,8 +537,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * we can set things up properly */ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached); if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { if (start_pos >= isize && @@ -559,7 +559,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, } err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - extra_bits, cached, 0); + extra_bits, cached); if (err) return err; @@ -1882,10 +1882,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, u64 start_pos; u64 end_pos; ssize_t num_written = 0; - bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + const bool sync = iocb->ki_flags & IOCB_DSYNC; ssize_t err; loff_t pos; - size_t count = iov_iter_count(from); + size_t count; loff_t oldsize; int clean_page = 0; @@ -1906,6 +1906,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } pos = iocb->ki_pos; + count = iov_iter_count(from); if (iocb->ki_flags & IOCB_NOWAIT) { /* * We will allocate space in case nodatacow is not set, @@ -2439,27 +2440,286 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } +static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_clone_extent_info *clone_info, + const u64 clone_len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int slot; + struct btrfs_ref ref = { 0 }; + u64 ref_offset; + int ret; + + if (clone_len == 0) + return 0; + + if (clone_info->disk_offset == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_info->file_offset; + ret = btrfs_insert_empty_item(trans, root, path, &key, + clone_info->item_size); + if (ret) + return ret; + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, clone_info->extent_buf, + btrfs_item_ptr_offset(leaf, slot), + clone_info->item_size); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + /* If it's a hole, nothing more needs to be done. */ + if (clone_info->disk_offset == 0) + return 0; + + inode_add_bytes(inode, clone_len); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + clone_info->disk_offset, + clone_info->disk_len, 0); + ref_offset = clone_info->file_offset - clone_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + + return ret; +} + +/* + * The respective range must have been previously locked, as well as the inode. + * The end offset is inclusive (last byte of the range). + * @clone_info is NULL for fallocate's hole punching and non-NULL for extent + * cloning. + * When cloning, we don't want to end up in a state where we dropped extents + * without inserting a new one, so we must abort the transaction to avoid a + * corruption. + */ +int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, + const u64 start, const u64 end, + struct btrfs_clone_extent_info *clone_info, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_block_rsv *rsv; + unsigned int rsv_count; + u64 cur_offset; + u64 drop_end; + u64 len = end - start; + int ret = 0; + + if (end <= start) + return -EINVAL; + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out; + } + rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent if no_holes isn't set or if we are cloning + * an extent + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + rsv_count = 3; + else + rsv_count = 2; + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + min_size, false); + BUG_ON(ret); + trans->block_rsv = rsv; + + cur_offset = start; + while (cur_offset < end) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, end + 1, &drop_end, + 1, 0, 0, NULL); + if (ret != -ENOSPC) { + /* + * When cloning we want to avoid transaction aborts when + * nothing was done and we are attempting to clone parts + * of inline extents, in such cases -EOPNOTSUPP is + * returned by __btrfs_drop_extents() without having + * changed anything in the file. + */ + if (clone_info && ret && ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, ret); + break; + } + + trans->block_rsv = &fs_info->trans_block_rsv; + + if (!clone_info && cur_offset < drop_end && + cur_offset < ino_size) { + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); + if (ret) { + /* + * If we failed then we didn't insert our hole + * entries for the area we dropped, so now the + * fs is corrupted, so we must abort the + * transaction. + */ + btrfs_abort_transaction(trans, ret); + break; + } + } + + if (clone_info) { + u64 clone_len = drop_end - cur_offset; + + ret = btrfs_insert_clone_extent(trans, inode, path, + clone_info, clone_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + clone_info->data_len -= clone_len; + clone_info->data_offset += clone_len; + clone_info->file_offset += clone_len; + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + break; + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, + rsv, min_size, false); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + + if (!clone_info) { + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } + } + } + + /* + * If we were cloning, force the next fsync to be a full one since we + * we replaced (or just dropped in the case of cloning holes when + * NO_HOLES is enabled) extents and extent maps. + * This is for the sake of simplicity, and cloning into files larger + * than 16Mb would force the full fsync any way (when + * try_release_extent_mapping() is invoked during page cache truncation. + */ + if (clone_info) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + if (ret) + goto out_trans; + + trans->block_rsv = &fs_info->trans_block_rsv; + /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= end) + drop_end = end + 1; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); + if (ret) { + /* Same comment as above. */ + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + if (clone_info) { + ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, + clone_info->data_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &fs_info->trans_block_rsv; + if (ret) + btrfs_end_transaction(trans); + else + *trans_out = trans; +out_free: + btrfs_free_block_rsv(fs_info, rsv); +out: + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_block_rsv *rsv; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; u64 lockstart; u64 lockend; u64 tail_start; u64 tail_len; u64 orig_start = offset; - u64 cur_offset; - u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 drop_end; int ret = 0; - int err = 0; - unsigned int rsv_count; bool same_block; - bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES); u64 ino_size; bool truncated_block = false; bool updated_inode = false; @@ -2566,145 +2826,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - ret = -ENOMEM; - goto out_free; - } - rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1); - rsv->failfast = 1; - - /* - * 1 - update the inode - * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set - */ - rsv_count = no_holes ? 2 : 3; - trans = btrfs_start_transaction(root, rsv_count); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, false); - BUG_ON(ret); - trans->block_rsv = rsv; - - cur_offset = lockstart; - len = lockend - cur_offset; - while (cur_offset < lockend) { - ret = __btrfs_drop_extents(trans, root, inode, path, - cur_offset, lockend + 1, - &drop_end, 1, 0, 0, NULL); - if (ret != -ENOSPC) - break; - - trans->block_rsv = &fs_info->trans_block_rsv; - - if (cur_offset < drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); - if (ret) { - /* - * If we failed then we didn't insert our hole - * entries for the area we dropped, so now the - * fs is corrupted, so we must abort the - * transaction. - */ - btrfs_abort_transaction(trans, ret); - err = ret; - break; - } - } - - cur_offset = drop_end; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - err = ret; - break; - } - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); - - trans = btrfs_start_transaction(root, rsv_count); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - break; - } - - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ - trans->block_rsv = rsv; - - ret = find_first_non_hole(inode, &cur_offset, &len); - if (unlikely(ret < 0)) - break; - if (ret && !len) { - ret = 0; - break; - } - } - - if (ret) { - err = ret; - goto out_trans; - } - - trans->block_rsv = &fs_info->trans_block_rsv; - /* - * If we are using the NO_HOLES feature we might have had already an - * hole that overlaps a part of the region [lockstart, lockend] and - * ends at (or beyond) lockend. Since we have no file extent items to - * represent holes, drop_end can be less than lockend and so we must - * make sure we have an extent map representing the existing hole (the - * call to __btrfs_drop_extents() might have dropped the existing extent - * map representing the existing hole), otherwise the fast fsync path - * will not record the existence of the hole region - * [existing_hole_start, lockend]. - */ - if (drop_end <= lockend) - drop_end = lockend + 1; - /* - * Don't insert file hole extent item if it's for a range beyond eof - * (because it's useless) or if it represents a 0 bytes range (when - * cur_offset == drop_end). - */ - if (cur_offset < ino_size && cur_offset < drop_end) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); - if (ret) { - /* Same comment as above. */ - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_trans; - } - } - -out_trans: - if (!trans) - goto out_free; + ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + &trans); + btrfs_free_path(path); + if (ret) + goto out; + ASSERT(trans != NULL); inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); -out_free: - btrfs_free_path(path); - btrfs_free_block_rsv(fs_info, rsv); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); out_only_mutex: - if (!updated_inode && truncated_block && !ret && !err) { + if (!updated_inode && truncated_block && !ret) { /* * If we only end up zeroing part of a page, we still need to * update the inode item, so that all the time fields are @@ -2719,16 +2858,18 @@ out_only_mutex: inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); } else { - err = btrfs_update_inode(trans, root, inode); - ret = btrfs_end_transaction(trans); + int ret2; + + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + if (!ret) + ret = ret2; } } inode_unlock(inode); - if (ret && !err) - err = ret; - return err; + return ret; } /* Helper structure to record which range is already reserved */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 062be9dde4c6..d54dcd0ab230 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "space-info.h" #include "delalloc-space.h" +#include "block-group.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K @@ -210,8 +211,8 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int ret; /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) + - btrfs_calc_trans_metadata_size(fs_info, 1); + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); spin_lock(&rsv->lock); if (rsv->reserved < needed_bytes) @@ -764,7 +765,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } else { ASSERT(num_bitmaps); num_bitmaps--; - e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); + e->bitmap = kmem_cache_zalloc( + btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { kmem_cache_free( btrfs_free_space_cachep, e); @@ -1004,7 +1006,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1016,9 +1018,8 @@ update_cache_item(struct btrfs_trans_handle *trans, if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL); + inode->i_size - 1, EXTENT_DELALLOC, 0, + 0, NULL); btrfs_release_path(path); goto fail; } @@ -1114,7 +1115,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); return ret; } @@ -1881,7 +1882,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { unlink_free_space(ctl, bitmap_info); - kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); @@ -2135,7 +2136,8 @@ new_bitmap: } /* allocate the bitmap */ - info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); + info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, + GFP_NOFS); spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -2146,7 +2148,9 @@ new_bitmap: out: if (info) { - kfree(info->bitmap); + if (info->bitmap) + kmem_cache_free(btrfs_free_space_bitmap_cachep, + info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } @@ -2376,6 +2380,14 @@ out: return ret; } +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size); +} + int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { @@ -2802,7 +2814,8 @@ out: if (entry->bytes == 0) { ctl->free_extents--; if (entry->bitmap) { - kfree(entry->bitmap); + kmem_cache_free(btrfs_free_space_bitmap_cachep, + entry->bitmap); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); } @@ -3606,7 +3619,7 @@ again: } if (!map) { - map = kzalloc(PAGE_SIZE, GFP_NOFS); + map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; @@ -3635,7 +3648,8 @@ again: if (info) kmem_cache_free(btrfs_free_space_cachep, info); - kfree(map); + if (map) + kmem_cache_free(btrfs_free_space_bitmap_cachep, map); return 0; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 8760acb55ffd..39c32c8fc24f 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -36,7 +36,19 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; -struct btrfs_io_ctl; +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_fs_info *fs_info; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; struct inode *lookup_free_space_inode( struct btrfs_block_group_cache *block_group, @@ -73,14 +85,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, u64 bytenr, u64 size); -static inline int -btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size) -{ - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size); -} +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index f5dc115ebba0..48a03f5240f5 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -10,6 +10,7 @@ #include "locking.h" #include "free-space-tree.h" #include "transaction.h" +#include "block-group.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 22b7602bde25..360d50e1cdea 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +struct btrfs_caching_control; + /* * The default size for new free space bitmap items. The last bitmap in a block * group may be truncated, and none of the free space tree code assumes that diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 30d62ef918b9..668701832845 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -8,9 +8,9 @@ #include "transaction.h" #include "print-tree.h" -int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -28,19 +28,15 @@ int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, cur_offset += len + sizeof(*ref); if (len != name_len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - if (ref_ret) - *ref_ret = ref; - return 1; - } + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return ref; } - return 0; + return NULL; } -int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, - u64 ref_objectid, - const char *name, int name_len, - struct btrfs_inode_extref **extref_ret) +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -65,15 +61,12 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, if (ref_name_len == name_len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { - if (extref_ret) - *extref_ret = extref; - return 1; - } + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + return extref; cur_offset += ref_name_len + sizeof(*extref); } - return 0; + return NULL; } /* Returns NULL if no extref found */ @@ -87,7 +80,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct btrfs_inode_extref *extref; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; @@ -98,11 +90,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return NULL; - if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len, - &extref)) - return NULL; - return extref; + return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len); + } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, @@ -142,9 +132,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * This should always succeed so error here will make the FS * readonly. */ - if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, name_len, &extref)) { + extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len); + if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; @@ -213,8 +203,10 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, } else if (ret < 0) { goto out; } - if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) { + + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, + name_len); + if (!ref) { ret = -ENOENT; search_ext_refs = 1; goto out; @@ -285,7 +277,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len, NULL)) + name, name_len)) goto out; btrfs_extend_item(path, ins_len); @@ -341,9 +333,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ins_len); if (ret == -EEXIST) { u32 old_size; - - if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len); + if (ref) goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -359,7 +351,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) + name, name_len)) ret = -EEXIST; else ret = -EMLINK; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2e8bb402050b..63cad7865d75 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -13,6 +13,19 @@ #include "transaction.h" #include "delalloc-space.h" +static void fail_caching_thread(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + btrfs_warn(fs_info, "failed to start inode caching task"); + btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, + "disabling inode map caching"); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_ERROR; + spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); +} + static int caching_kthread(void *data) { struct btrfs_root *root = data; @@ -29,8 +42,10 @@ static int caching_kthread(void *data) return 0; path = btrfs_alloc_path(); - if (!path) + if (!path) { + fail_caching_thread(root); return -ENOMEM; + } /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; @@ -146,6 +161,7 @@ static void start_caching(struct btrfs_root *root) spin_lock(&root->ino_cache_lock); root->ino_cache_state = BTRFS_CACHE_FINISHED; spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); return; } @@ -160,15 +176,13 @@ static void start_caching(struct btrfs_root *root) if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { __btrfs_add_free_space(fs_info, ctl, objectid, BTRFS_LAST_FREE_OBJECTID - objectid + 1); + wake_up(&root->ino_cache_wait); } tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); - if (IS_ERR(tsk)) { - btrfs_warn(fs_info, "failed to start inode caching task"); - btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); - } + if (IS_ERR(tsk)) + fail_caching_thread(root); } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -186,11 +200,14 @@ again: wait_event(root->ino_cache_wait, root->ino_cache_state == BTRFS_CACHE_FINISHED || + root->ino_cache_state == BTRFS_CACHE_ERROR || root->free_ino_ctl->free_space > 0); if (root->ino_cache_state == BTRFS_CACHE_FINISHED && root->free_ino_ctl->free_space == 0) return -ENOSPC; + else if (root->ino_cache_state == BTRFS_CACHE_ERROR) + return btrfs_find_free_objectid(root, objectid); else goto again; } @@ -419,7 +436,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10); + trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); @@ -485,6 +502,7 @@ again: prealloc, prealloc, &alloc_hint); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); + btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); goto out_put; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee582a36653d..a0546401bc0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -30,6 +30,7 @@ #include <linux/swap.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -46,8 +47,8 @@ #include "backref.h" #include "props.h" #include "qgroup.h" -#include "dedupe.h" #include "delalloc-space.h" +#include "block-group.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -74,15 +75,15 @@ static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; +struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash); + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -178,6 +179,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; unsigned long offset; + ASSERT((compressed_size > 0 && compressed_pages) || + (compressed_size == 0 && !compressed_pages)); + if (compressed_size && compressed_pages) cur_size = compressed_size; @@ -462,8 +466,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, * are written in the same order that the flusher thread sent them * down. */ -static noinline void compress_file_range(struct async_chunk *async_chunk, - int *num_added) +static noinline int compress_file_range(struct async_chunk *async_chunk) { struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -479,6 +482,7 @@ static noinline void compress_file_range(struct async_chunk *async_chunk, int i; int will_compress; int compress_type = fs_info->compress_type; + int compressed_extents = 0; int redirty = 0; inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, @@ -615,14 +619,21 @@ cont: * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, end, - NULL, clear_flags, + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - goto free_pages_out; + + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); + + return 0; } } @@ -641,7 +652,7 @@ cont: */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - *num_added += 1; + compressed_extents++; /* * The async work queues will take care of doing actual @@ -658,7 +669,7 @@ cont: cond_resched(); goto again; } - return; + return compressed_extents; } } if (pages) { @@ -697,16 +708,9 @@ cleanup_and_bail_uncompressed: extent_range_redirty_for_io(inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - - return; + compressed_extents++; -free_pages_out: - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); + return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -762,10 +766,7 @@ retry: async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0, - NULL); + &page_started, &nr_written, 0); /* JDM XXX */ @@ -855,8 +856,6 @@ retry: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); @@ -875,7 +874,7 @@ retry: btrfs_writepage_endio_finish_ordered(p, start, end, 0); p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); @@ -893,8 +892,6 @@ out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@ -953,9 +950,8 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash) + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -994,8 +990,7 @@ static noinline int cow_file_range(struct inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, - delalloc_end, NULL, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1078,7 +1073,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - delalloc_end, locked_page, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1123,7 +1118,6 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size, - start + cur_alloc_size, locked_page, clear_bits, page_ops); @@ -1131,8 +1125,7 @@ out_unlock: if (start >= end) goto out; } - extent_clear_unlock_delalloc(inode, start, end, delalloc_end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; @@ -1144,12 +1137,12 @@ out_unlock: static noinline void async_cow_start(struct btrfs_work *work) { struct async_chunk *async_chunk; - int num_added = 0; + int compressed_extents; async_chunk = container_of(work, struct async_chunk, work); - compress_file_range(async_chunk, &num_added); - if (num_added == 0) { + compressed_extents = compress_file_range(async_chunk); + if (compressed_extents == 0) { btrfs_add_delayed_iput(async_chunk->inode); async_chunk->inode = NULL; } @@ -1235,7 +1228,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR; - extent_clear_unlock_delalloc(inode, start, end, 0, locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); return -ENOMEM; } @@ -1310,36 +1303,25 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, */ static noinline int run_delalloc_nocow(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, int force, - unsigned long *nr_written) + const u64 start, const u64 end, + int *page_started, int force, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *leaf; struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct btrfs_key found_key; - struct extent_map *em; - u64 cow_start; - u64 cur_offset; - u64 extent_end; - u64 extent_offset; - u64 disk_bytenr; - u64 num_bytes; - u64 disk_num_bytes; - u64 ram_bytes; - int extent_type; + u64 cow_start = (u64)-1; + u64 cur_offset = start; int ret; - int type; - int nocow; - int check_prev = 1; - bool nolock; + bool check_prev = true; + const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); u64 ino = btrfs_ino(BTRFS_I(inode)); + bool nocow = false; + u64 disk_bytenr = 0; path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | @@ -1349,15 +1331,29 @@ static noinline int run_delalloc_nocow(struct inode *inode, return -ENOMEM; } - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); - - cow_start = (u64)-1; - cur_offset = start; while (1) { + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u64 extent_end; + u64 extent_offset; + u64 num_bytes = 0; + u64 disk_num_bytes; + u64 ram_bytes; + int extent_type; + + nocow = false; + ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; + + /* + * If there is no extent for our range when doing the initial + * search, then go back to the previous slot as it will be the + * one containing the search offset + */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1366,8 +1362,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } - check_prev = 0; + check_prev = false; next_slot: + /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -1381,28 +1378,40 @@ next_slot: leaf = path->nodes[0]; } - nocow = 0; - disk_bytenr = 0; - num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; + /* + * Keep searching until we find an EXTENT_ITEM or there are no + * more extents for this inode + */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } + + /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; + /* + * If the found extent starts after requested offset, then + * adjust extent_end to be right before this extent begins + */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto out_check; } + /* + * Found extent which begins before our range and potentially + * intersect it + */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1416,26 +1425,36 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + /* + * If extent we got ends before our range starts, skip + * to next extent + */ if (extent_end <= start) { path->slots[0]++; goto next_slot; } + /* Skip holes */ if (disk_bytenr == 0) goto out_check; + /* Skip compressed/encrypted/encoded extents */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; /* - * Do the same check as in btrfs_cross_ref_exist but - * without the unnecessary search. + * If extent is created before the last volume's snapshot + * this implies the extent is shared, hence we can't do + * nocow. This is the same check as in + * btrfs_cross_ref_exist but without calling + * btrfs_search_slot. */ - if (!nolock && + if (!freespace_inode && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; + /* If extent is RO, we must COW it */ if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; ret = btrfs_cross_ref_exist(root, ino, @@ -1452,17 +1471,17 @@ next_slot: goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* - * if there are pending snapshots for this root, - * we fall into common COW way. + * If there are pending snapshots for this root, we + * fall into common COW way */ - if (!nolock && atomic_read(&root->snapshot_force_cow)) + if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out_check; /* * force cow if csum exists in the range. @@ -1481,27 +1500,29 @@ next_slot: cur_offset = cow_start; goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; - nocow = 1; + nocow = true; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + - btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_end, - fs_info->sectorsize); + extent_end = found_key.offset + ram_bytes; + extent_end = ALIGN(extent_end, fs_info->sectorsize); + /* Skip extents outside of our requested range */ + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } } else { + /* If this triggers then we have a memory corruption */ BUG(); } out_check: - if (extent_end <= start) { - path->slots[0]++; - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - goto next_slot; - } + /* + * If nocow is false then record the beginning of the range + * that needs to be COWed + */ if (!nocow) { if (cow_start == (u64)-1) cow_start = cur_offset; @@ -1513,11 +1534,16 @@ out_check: } btrfs_release_path(path); + + /* + * COW range from cow_start to found_key.offset - 1. As the key + * will contain the beginning of the first extent that can be + * NOCOW, following one which needs to be COW'ed + */ if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - end, page_started, nr_written, 1, - NULL); + page_started, nr_written, 1); if (ret) { if (nocow) btrfs_dec_nocow_writers(fs_info, @@ -1529,6 +1555,7 @@ out_check: if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 orig_start = found_key.offset - extent_offset; + struct extent_map *em; em = create_io_em(inode, cur_offset, num_bytes, orig_start, @@ -1545,19 +1572,29 @@ out_check: goto error; } free_extent_map(em); - } - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - type = BTRFS_ORDERED_PREALLOC; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_PREALLOC); + if (ret) { + btrfs_drop_extent_cache(BTRFS_I(inode), + cur_offset, + cur_offset + num_bytes - 1, + 0); + goto error; + } } else { - type = BTRFS_ORDERED_NOCOW; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_NOCOW); + if (ret) + goto error; } - ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, - num_bytes, num_bytes, type); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); - BUG_ON(ret); /* -ENOMEM */ + nocow = false; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) @@ -1570,7 +1607,7 @@ out_check: num_bytes); extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, end, + cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1595,15 +1632,18 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = cow_file_range(inode, locked_page, cow_start, end, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); if (ret) goto error; } error: + if (nocow) + btrfs_dec_nocow_writers(fs_info, disk_bytenr); + if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, cur_offset, end, end, + extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1654,8 +1694,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -2090,7 +2130,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int dedupe) + struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, @@ -2156,7 +2196,7 @@ again: } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - &cached_state, 0); + &cached_state); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -3850,7 +3890,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); @@ -4946,12 +4986,11 @@ again: } clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, - &cached_state, 0); + &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -5332,9 +5371,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, &cached_state); + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5347,59 +5386,50 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1); - int failures = 0; - - for (;;) { - struct btrfs_trans_handle *trans; - int ret; - - ret = btrfs_block_rsv_refill(root, rsv, - rsv->size + delayed_refs_extra, - BTRFS_RESERVE_FLUSH_LIMIT); - - if (ret && ++failures > 2) { - btrfs_warn(fs_info, - "could not allocate space for a delete; will truncate on mount"); - return ERR_PTR(-ENOSPC); - } - - /* - * Evict can generate a large amount of delayed refs without - * having a way to add space back since we exhaust our temporary - * block rsv. We aren't allowed to do FLUSH_ALL in this case - * because we could deadlock with so many things in the flushing - * code, so we have to try and hold some extra space to - * compensate for our delayed ref generation. If we can't get - * that space then we need see if we can steal our minimum from - * the global reserve. We will be ratelimited by the amount of - * space we have for the delayed refs rsv, so we'll end up - * committing and trying again. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans) || !ret) { - if (!IS_ERR(trans)) { - trans->block_rsv = &fs_info->trans_block_rsv; - trans->bytes_reserved = delayed_refs_extra; - btrfs_block_rsv_migrate(rsv, trans->block_rsv, - delayed_refs_extra, 1); - } - return trans; - } + struct btrfs_trans_handle *trans; + u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); + int ret; + /* + * Eviction should be taking place at some place safe because of our + * delayed iputs. However the normal flushing code will run delayed + * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. + * + * We reserve the delayed_refs_extra here again because we can't use + * btrfs_start_transaction(root, 0) for the same deadlocky reason as + * above. We reserve our extra bit here because we generate a ton of + * delayed refs activity by truncating. + * + * If we cannot make our reservation we'll attempt to steal from the + * global reserve, because we really want to be able to free up space. + */ + ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { /* * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) - return trans; + if (btrfs_check_space_for_delayed_refs(fs_info) || + btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + btrfs_warn(fs_info, + "could not allocate space for delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + delayed_refs_extra = 0; + } - /* If not, commit and try again. */ - ret = btrfs_commit_transaction(trans); - if (ret) - return ERR_PTR(ret); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return trans; + + if (delayed_refs_extra) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, 1); } + return trans; } void btrfs_evict_inode(struct inode *inode) @@ -5446,7 +5476,7 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); + rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -7701,12 +7731,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - int unlock_bits = EXTENT_LOCKED; int ret = 0; - if (create) - unlock_bits |= EXTENT_DIRTY; - else + if (!create) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; @@ -7765,9 +7792,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (ret < 0) goto unlock_err; - /* clear and unlock the entire range */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); } else { ret = btrfs_get_blocks_direct_read(em, bh_result, inode, start, len); @@ -7783,9 +7809,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, */ lockstart = start + bh_result->b_size; if (lockstart < lockend) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); } else { free_extent_state(cached_state); } @@ -7796,8 +7821,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, return 0; unlock_err: - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: if (dio_data) current->journal_info = dio_data; @@ -8812,8 +8837,7 @@ again: */ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8868,8 +8892,7 @@ again: if (PageDirty(page)) btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | + clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state); @@ -8997,12 +9020,11 @@ again: * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, - &cached_state, 0); + &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); @@ -9060,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); + u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -9380,6 +9402,7 @@ void __cold btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -9409,6 +9432,12 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_RED_ZONE, NULL); + if (!btrfs_free_space_bitmap_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 818f7ec8bb0e..de730e56d3f5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -45,6 +45,7 @@ #include "compression.h" #include "space-info.h" #include "delalloc-space.h" +#include "block-group.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -1332,9 +1333,8 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - &cached_state); + page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1840,8 +1840,15 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + btrfs_warn(fs_info, +"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); + ptr = &transid; + } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -3324,61 +3331,6 @@ out: return ret; } -static void clone_update_extent_map(struct btrfs_inode *inode, - const struct btrfs_trans_handle *trans, - const struct btrfs_path *path, - const u64 hole_offset, - const u64 hole_len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - em = alloc_extent_map(); - if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); - return; - } - - if (path) { - struct btrfs_file_extent_item *fi; - - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - btrfs_extent_item_to_extent_map(inode, path, fi, false, em); - em->generation = -1; - if (btrfs_file_extent_type(path->nodes[0], fi) == - BTRFS_FILE_EXTENT_INLINE) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); - } else { - em->start = hole_offset; - em->len = hole_len; - em->ram_bytes = em->len; - em->orig_start = hole_offset; - em->block_start = EXTENT_MAP_HOLE; - em->block_len = 0; - em->orig_block_len = 0; - em->compress_type = BTRFS_COMPRESS_NONE; - em->generation = trans->transid; - } - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - } - - if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); -} - /* * Make sure we do not end up inserting an inline extent into a file that has * already other (non-inline) extents. If a file has an inline extent it can @@ -3519,6 +3471,7 @@ copy_inline_extent: path->slots[0]), size); inode_add_bytes(dst, datal); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); return 0; } @@ -3570,6 +3523,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode, while (1) { u64 next_key_min_offset = key.offset + 1; + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; /* * note the key will change type as we walk through the @@ -3610,75 +3571,115 @@ process_slot: key.objectid != btrfs_ino(BTRFS_I(src))) break; - if (key.type == BTRFS_EXTENT_DATA_KEY) { - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 drop_start; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, - extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, - extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, - extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, - extent); - } + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, extent); + } + + /* + * The first search might have left us at an extent item that + * ends before our target range's start, can happen if we have + * holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + next_key_min_offset = key.offset + datal; + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item that + * represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning range or at + * the beginning (fully overlaps it or partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + struct btrfs_clone_extent_info clone_info; /* - * The first search might have left us at an extent - * item that ends before our target range's start, can - * happen if we have holes and NO_HOLES feature enabled. + * a | --- range to clone ---| b + * | ------------- extent ------------- | */ - if (key.offset + datal <= off) { - path->slots[0]++; - goto process_slot; - } else if (key.offset >= off + len) { - break; + + /* Subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* Subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; } - next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - btrfs_release_path(path); - path->leave_spinning = 0; + clone_info.disk_offset = disko; + clone_info.disk_len = diskl; + clone_info.data_offset = datao; + clone_info.data_len = datal; + clone_info.file_offset = new_key.offset; + clone_info.extent_buf = buf; + clone_info.item_size = size; + ret = btrfs_punch_hole_range(inode, path, + drop_start, + new_key.offset + datal - 1, + &clone_info, &trans); + if (ret) + goto out; + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(BTRFS_I(inode)); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } - /* - * Deal with a hole that doesn't have an extent item - * that represents it (NO_HOLES feature enabled). - * This hole is either in the middle of the cloning - * range or at the beginning (fully overlaps it or - * partially overlaps it). - */ - if (new_key.offset != last_dest_end) - drop_start = last_dest_end; - else - drop_start = new_key.offset; + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; /* + * If our extent is inline, we know we will drop or + * adjust at most 1 extent item in the destination root. + * * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3689,140 +3690,28 @@ process_slot: goto out; } - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* subtract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* subtract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - new_key.offset + datal, - 1); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { + ret = clone_copy_inline_extent(inode, trans, path, + &new_key, drop_start, + datal, skip, size, buf); + if (ret) { + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - /* disko == 0 means it's a hole */ - if (!disko) - datao = 0; - - btrfs_set_file_extent_offset(leaf, extent, - datao); - btrfs_set_file_extent_num_bytes(leaf, extent, - datal); - - if (disko) { - struct btrfs_ref ref = { 0 }; - inode_add_bytes(inode, datal); - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, disko, - diskl, 0); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), - new_key.offset - datao); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - - } - } - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off + len) - trim = key.offset + datal - (off + len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - btrfs_end_transaction(trans); - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - ret = clone_copy_inline_extent(inode, - trans, path, - &new_key, - drop_start, - datal, - skip, size, buf); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - } - leaf = path->nodes[0]; - slot = path->slots[0]; + btrfs_end_transaction(trans); + goto out; } + } - /* If we have an implicit hole (NO_HOLES feature). */ - if (drop_start < new_key.offset) - clone_update_extent_map(BTRFS_I(inode), trans, - NULL, drop_start, - new_key.offset - drop_start); - - clone_update_extent_map(BTRFS_I(inode), trans, - path, 0, 0); + btrfs_release_path(path); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + last_dest_end = ALIGN(new_key.offset + datal, + fs_info->sectorsize); + ret = clone_finish_inode_update(trans, inode, last_dest_end, + destoff, olen, no_time_update); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; - last_dest_end = ALIGN(new_key.offset + datal, - fs_info->sectorsize); - ret = clone_finish_inode_update(trans, inode, - last_dest_end, - destoff, olen, - no_time_update); - if (ret) - goto out; - if (new_key.offset + datal >= destoff + len) - break; - } btrfs_release_path(path); key.offset = next_key_min_offset; @@ -3834,32 +3723,27 @@ process_slot: ret = 0; if (last_dest_end < destoff + len) { + struct btrfs_clone_extent_info clone_info = { 0 }; /* * We have an implicit hole (NO_HOLES feature is enabled) that * fully or partially overlaps our cloning range at its end. */ btrfs_release_path(path); + path->leave_spinning = 0; /* - * 1 - remove extent(s) - * 1 - inode update + * We are dealing with a hole and our clone_info already has a + * disk_offset of 0, we only need to fill the data length and + * file offset. */ - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - ret = btrfs_drop_extents(trans, root, inode, - last_dest_end, destoff + len, 1); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); + clone_info.data_len = destoff + len - last_dest_end; + clone_info.file_offset = last_dest_end; + ret = btrfs_punch_hole_range(inode, path, + last_dest_end, destoff + len - 1, + &clone_info, &trans); + if (ret) goto out; - } - clone_update_extent_map(BTRFS_I(inode), trans, NULL, - last_dest_end, - destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } @@ -4313,6 +4197,9 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; + btrfs_warn(root->fs_info, + "START_SYNC ioctl is deprecated and will be removed in kernel 5.7"); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -4340,6 +4227,9 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, { u64 transid; + btrfs_warn(fs_info, + "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7"); + if (argp) { if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; @@ -5381,7 +5271,7 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) { - const char *type = btrfs_feature_set_names[set]; + const char *type = btrfs_feature_set_name(set); char *names; u64 disallowed, unsupported; u64 set_mask = flags & change_mask; @@ -5562,6 +5452,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); + case FS_IOC_GETFSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case FS_IOC_SETFSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); case FITRIM: return btrfs_ioctl_fitrim(file, argp); case BTRFS_IOC_SNAP_CREATE: @@ -5673,10 +5567,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); - case BTRFS_IOC_GET_FSLABEL: - return btrfs_ioctl_get_fslabel(file, argp); - case BTRFS_IOC_SET_FSLABEL: - return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 393eceda57c8..7f9a578a1a20 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" @@ -119,42 +120,6 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) } } -void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) -{ - trace_btrfs_clear_lock_blocking_read(eb); - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. - */ - if (eb->lock_nested && current->pid == eb->lock_owner) - return; - BUG_ON(atomic_read(&eb->blocking_readers) == 0); - read_lock(&eb->lock); - btrfs_assert_spinning_readers_get(eb); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); -} - -void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) -{ - trace_btrfs_clear_lock_blocking_write(eb); - /* - * no lock is required. The lock owner may change if - * we have a read lock, but it won't change to or away - * from us. If we have the write lock, we are the owner - * and it'll never change. - */ - if (eb->lock_nested && current->pid == eb->lock_owner) - return; - write_lock(&eb->lock); - BUG_ON(eb->blocking_writers != 1); - btrfs_assert_spinning_writers_get(eb); - if (--eb->blocking_writers == 0) - cond_wake_up(&eb->write_lock_wq); -} - /* * take a spinning read lock. This will wait for any blocking * writers diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 595014f64830..b775a4207ed9 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,8 +19,6 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); void btrfs_set_lock_blocking_read(struct extent_buffer *eb); void btrfs_set_lock_blocking_write(struct extent_buffer *eb); -void btrfs_clear_lock_blocking_read(struct extent_buffer *eb); -void btrfs_clear_lock_blocking_write(struct extent_buffer *eb); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 579d53ae256f..acad4174f68d 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -507,11 +507,6 @@ out: return ret; } -static unsigned int lzo_set_level(unsigned int level) -{ - return 0; -} - const struct btrfs_compress_op btrfs_lzo_compress = { .init_workspace_manager = lzo_init_workspace_manager, .cleanup_workspace_manager = lzo_cleanup_workspace_manager, @@ -522,5 +517,6 @@ const struct btrfs_compress_op btrfs_lzo_compress = { .compress_pages = lzo_compress_pages, .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, - .set_level = lzo_set_level, + .max_level = 1, + .default_level = 1, }; diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h deleted file mode 100644 index 75246f2f56ba..000000000000 --- a/fs/btrfs/math.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 Fujitsu. All rights reserved. - * Written by Miao Xie <miaox@cn.fujitsu.com> - */ - -#ifndef BTRFS_MATH_H -#define BTRFS_MATH_H - -#include <asm/div64.h> - -static inline u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); -} - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - -#endif diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h new file mode 100644 index 000000000000..7d564924dfeb --- /dev/null +++ b/fs/btrfs/misc.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MISC_H +#define BTRFS_MISC_H + +#include <linux/sched.h> +#include <linux/wait.h> +#include <asm/div64.h> + +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) + +static inline void cond_wake_up(struct wait_queue_head *wq) +{ + /* + * This implies a full smp_mb barrier, see comments for + * waitqueue_active why. + */ + if (wq_has_sleeper(wq)) + wake_up(wq); +} + +static inline void cond_wake_up_nomb(struct wait_queue_head *wq) +{ + /* + * Special case for conditional wakeup where the barrier required for + * waitqueue_active is implied by some of the preceding code. Eg. one + * of such atomic operations (atomic_dec_and_return, ...), or a + * unlock/lock sequence, etc. + */ + if (waitqueue_active(wq)) + wake_up(wq); +} + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + return div_u64(num, 10); +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + return div_u64(num, 100); +} + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7f64a8facb..24b6c72b9a59 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include "misc.h" #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index e0469816c678..1e664e0b59b8 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -362,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * reservations if we do add more properties in the future. */ if (need_reserve) { - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); ret = btrfs_block_rsv_add(root, trans->block_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); if (ret) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f8a3c1b0a15a..8d3bd799ac7d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -21,7 +21,7 @@ #include "backref.h" #include "extent_io.h" #include "qgroup.h" - +#include "block-group.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -1312,8 +1312,9 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; struct ulist *tmp; + bool found = false; int ret = 0; - int err; + int ret2; tmp = ulist_alloc(GFP_KERNEL); if (!tmp) @@ -1327,28 +1328,39 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); - if (!member || !parent) { - ret = -EINVAL; - goto out; - } + /* + * The parent/member pair doesn't exist, then try to delete the dead + * relation items only. + */ + if (!member || !parent) + goto delete_item; /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { - if (list->group == parent) - goto exist; + if (list->group == parent) { + found = true; + break; + } } - ret = -ENOENT; - goto out; -exist: + +delete_item: ret = del_qgroup_relation_item(trans, src, dst); - err = del_qgroup_relation_item(trans, dst, src); - if (err && !ret) - ret = err; + if (ret < 0 && ret != -ENOENT) + goto out; + ret2 = del_qgroup_relation_item(trans, dst, src); + if (ret2 < 0 && ret2 != -ENOENT) + goto out; - spin_lock(&fs_info->qgroup_lock); - del_relation_rb(fs_info, src, dst); - ret = quick_update_accounting(fs_info, tmp, src, dst, -1); - spin_unlock(&fs_info->qgroup_lock); + /* At least one deletion succeeded, return 0 */ + if (!ret || !ret2) + ret = 0; + + if (found) { + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + spin_unlock(&fs_info->qgroup_lock); + } out: ulist_free(tmp); return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f3d0576dd327..57a2ac721985 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -35,6 +35,22 @@ #define RBIO_CACHE_SIZE 1024 +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + spinlock_t lock; +}; + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index bb5bd49573b4..ee6f60547a8d 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -14,6 +14,7 @@ #include "disk-io.h" #include "transaction.h" #include "dev-replace.h" +#include "block-group.h" #undef DEBUG @@ -638,6 +639,35 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } +static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + int ret; + + buf = btrfs_find_create_tree_block(fs_info, bytenr); + if (IS_ERR(buf)) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); + if (ret) { + free_extent_buffer_stale(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer_stale(buf); + return -EIO; + } else if (extent_buffer_uptodate(buf)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + static int reada_start_machine_dev(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f219851fa23..2f0e25afa486 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" +#include "block-group.h" /* * backref_node, mapping_node and tree_block start with this @@ -3311,7 +3312,7 @@ static int relocate_file_extent_cluster(struct inode *inode, } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - NULL, 0); + NULL); if (ret) { unlock_page(page); put_page(page); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 47733fb55df7..3b17b647d002 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -533,7 +533,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, return ret; } - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); ret = btrfs_block_rsv_add(root, rsv, num_bytes, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c99cf9fb595..f7d4e03f4c5d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -18,6 +18,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "raid56.h" +#include "block-group.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c3c0c064c25d..f3215028235c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -260,6 +260,21 @@ struct name_cache_entry { char name[]; }; +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); + __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, enum btrfs_compare_tree_result result, @@ -6514,6 +6529,366 @@ out: return ret; } +static int tree_move_down(struct btrfs_path *path, int *level) +{ + struct extent_buffer *eb; + + BUG_ON(*level == 0); + eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; + path->slots[*level - 1] = 0; + (*level)--; + return 0; +} + +static int tree_move_next_or_upnext(struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(path, level, root_level); + } else { + ret = tree_move_down(path, level); + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +static int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + struct btrfs_fs_info *fs_info = left_root->fs_info; + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + down_read(&fs_info->commit_root_sem); + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } + up_read(&fs_info->commit_root_sem); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + cond_resched(); + if (advance_left && !left_end_reached) { + ret = tree_advance(left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret == -1) + left_end_reached = ADVANCE; + else if (ret < 0) + goto out; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret == -1) + right_end_reached = ADVANCE; + else if (ret < 0) + goto out; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result result; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_path, right_path, + tmp_buf); + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_path, right_path, + &left_key, result, ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kvfree(tmp_buf); + return ret; +} + static int send_subvol(struct send_ctx *sctx) { int ret; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ab7b9ec4c240..98dc092a905e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "misc.h" #include "ctree.h" #include "space-info.h" #include "sysfs.h" @@ -7,7 +8,7 @@ #include "free-space-cache.h" #include "ordered-data.h" #include "transaction.h" -#include "math.h" +#include "block-group.h" u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) @@ -33,23 +34,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static const char *alloc_name(u64 flags) -{ - switch (flags) { - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: - return "mixed"; - case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; - case BTRFS_BLOCK_GROUP_DATA: - return "data"; - case BTRFS_BLOCK_GROUP_SYSTEM: - return "system"; - default: - WARN_ON(1); - return "invalid-combination"; - }; -} - static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -79,13 +63,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(space_info->flags)); - if (ret) { - kobject_put(&space_info->kobj); + ret = btrfs_sysfs_add_space_info_type(info, space_info); + if (ret) return ret; - } list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -151,9 +131,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; - btrfs_space_info_add_new_bytes(info, found, - total_bytes - bytes_used - - bytes_readonly); + btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); *space_info = found; } @@ -187,9 +165,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush, bool system_chunk) { - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; - u64 space_size; u64 avail; u64 used; int factor; @@ -203,22 +179,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, else profile = btrfs_metadata_alloc_profile(fs_info); - used = btrfs_space_info_used(space_info, false); - - /* - * We only want to allow over committing if we have lots of actual space - * free, but if we don't have enough space to handle the global reserve - * space then we could end up having a real enospc problem when trying - * to allocate a chunk or some other such important allocation. - */ - spin_lock(&global_rsv->lock); - space_size = calc_global_rsv_need_space(global_rsv); - spin_unlock(&global_rsv->lock); - if (used + space_size >= space_info->total_bytes) - return 0; - - used += space_info->bytes_may_use; - + used = btrfs_space_info_used(space_info, true); avail = atomic64_read(&fs_info->free_chunk_space); /* @@ -249,103 +210,41 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. */ -void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) { - struct reserve_ticket *ticket; struct list_head *head; - u64 used; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; - bool check_overcommit = false; - spin_lock(&space_info->lock); - head = &space_info->priority_tickets; + lockdep_assert_held(&space_info->lock); - /* - * If we are over our limit then we need to check and see if we can - * overcommit, and if we can't then we just need to free up our space - * and not satisfy any requests. - */ - used = btrfs_space_info_used(space_info, true); - if (used - num_bytes >= space_info->total_bytes) - check_overcommit = true; + head = &space_info->priority_tickets; again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - /* - * We use 0 bytes because this space is already reserved, so - * adding the ticket space would be a double count. - */ - if (check_overcommit && - !can_overcommit(fs_info, space_info, 0, flush, false)) - break; - if (num_bytes >= ticket->bytes) { - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - flush = BTRFS_RESERVE_FLUSH_ALL; - goto again; - } - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); -} + while (!list_empty(head)) { + struct reserve_ticket *ticket; + u64 used = btrfs_space_info_used(space_info, true); -/* - * This is for newly allocated space that isn't accounted in - * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent - * we use this helper. - */ -void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head = &space_info->priority_tickets; + ticket = list_first_entry(head, struct reserve_ticket, list); -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - if (num_bytes >= ticket->bytes) { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - ticket->bytes, 1); - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; + /* Check and see if our ticket can be satisified now. */ + if ((used + ticket->bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, ticket->bytes, flush, + false)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); + list_del_init(&ticket->list); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); } else { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 1); - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - num_bytes); - ticket->bytes -= num_bytes; - num_bytes = 0; + break; } } - if (num_bytes && head == &space_info->priority_tickets) { + if (head == &space_info->priority_tickets) { head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } } @@ -359,14 +258,11 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) +static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info) { - struct btrfs_block_group_cache *cache; - int index = 0; + lockdep_assert_held(&info->lock); - spin_lock(&info->lock); btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", info->flags, info->total_bytes - btrfs_space_info_used(info, true), @@ -376,7 +272,6 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, info->bytes_readonly); - spin_unlock(&info->lock); DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); @@ -384,6 +279,19 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); +} + +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + __btrfs_dump_space_info(fs_info, info); + spin_unlock(&info->lock); + if (!dump_block_groups) return; @@ -432,7 +340,7 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 bytes; u64 nr; - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; @@ -557,12 +465,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; u64 bytes_needed; u64 reclaim_bytes = 0; + u64 cur_free_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; spin_lock(&space_info->lock); + cur_free_bytes = btrfs_space_info_used(space_info, true); + if (cur_free_bytes < space_info->total_bytes) + cur_free_bytes = space_info->total_bytes - cur_free_bytes; + else + cur_free_bytes = 0; + if (!list_empty(&space_info->priority_tickets)) ticket = list_first_entry(&space_info->priority_tickets, struct reserve_ticket, list); @@ -570,6 +485,11 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); bytes_needed = (ticket) ? ticket->bytes : 0; + + if (bytes_needed > cur_free_bytes) + bytes_needed -= cur_free_bytes; + else + bytes_needed = 0; spin_unlock(&space_info->lock); if (!bytes_needed) @@ -684,7 +604,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, if (ret > 0 || ret == -ENOSPC) ret = 0; break; - case COMMIT_TRANS: + case RUN_DELAYED_IPUTS: /* * If we have pending delayed iputs then we could free up a * bunch of pinned space, so make sure we run the iputs before @@ -692,7 +612,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ btrfs_run_delayed_iputs(fs_info); btrfs_wait_on_delayed_iputs(fs_info); - + break; + case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info); break; default: @@ -762,19 +683,70 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static bool wake_all_tickets(struct list_head *head) +/* + * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * @fs_info - fs_info for this fs + * @space_info - the space info we were flushing + * + * We call this when we've exhausted our flushing ability and haven't made + * progress in satisfying tickets. The reservation code handles tickets in + * order, so if there is a large ticket first and then smaller ones we could + * very well satisfy the smaller tickets. This will attempt to wake up any + * tickets in the list to catch this case. + * + * This function returns true if it was able to make progress by clearing out + * other tickets, or if it stumbles across a ticket that was smaller than the + * first ticket. + */ +static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) { struct reserve_ticket *ticket; + u64 tickets_id = space_info->tickets_id; + u64 first_ticket_bytes = 0; + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); + __btrfs_dump_space_info(fs_info, space_info); + } + + while (!list_empty(&space_info->tickets) && + tickets_id == space_info->tickets_id) { + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + + /* + * may_commit_transaction will avoid committing the transaction + * if it doesn't feel like the space reclaimed by the commit + * would result in the ticket succeeding. However if we have a + * smaller ticket in the queue it may be small enough to be + * satisified by committing the transaction, so if any + * subsequent ticket is smaller than the first ticket go ahead + * and send us back for another loop through the enospc flushing + * code. + */ + if (first_ticket_bytes == 0) + first_ticket_bytes = ticket->bytes; + else if (first_ticket_bytes > ticket->bytes) + return true; + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_info(fs_info, "failing ticket with %llu bytes", + ticket->bytes); - while (!list_empty(head)) { - ticket = list_first_entry(head, struct reserve_ticket, list); list_del_init(&ticket->list); ticket->error = -ENOSPC; wake_up(&ticket->wait); - if (ticket->bytes != ticket->orig_bytes) - return true; + + /* + * We're just throwing tickets away, so more flushing may not + * trip over btrfs_try_granting_tickets, so we need to call it + * here to see if we can make progress with the next ticket in + * the list. + */ + btrfs_try_granting_tickets(fs_info, space_info); } - return false; + return (tickets_id != space_info->tickets_id); } /* @@ -842,7 +814,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state > COMMIT_TRANS) { commit_cycles++; if (commit_cycles > 2) { - if (wake_all_tickets(&space_info->tickets)) { + if (maybe_fail_all_tickets(fs_info, space_info)) { flush_state = FLUSH_DELAYED_ITEMS_NR; commit_cycles--; } else { @@ -867,9 +839,22 @@ static const enum btrfs_flush_state priority_flush_states[] = { ALLOC_CHUNK, }; +static const enum btrfs_flush_state evict_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + FLUSH_DELAYED_REFS_NR, + FLUSH_DELAYED_REFS, + FLUSH_DELALLOC, + FLUSH_DELALLOC_WAIT, + ALLOC_CHUNK, + COMMIT_TRANS, +}; + static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) { u64 to_reclaim; int flush_state; @@ -885,8 +870,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, - priority_flush_states[flush_state]); + flush_space(fs_info, space_info, to_reclaim, states[flush_state]); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -894,23 +878,22 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, return; } spin_unlock(&space_info->lock); - } while (flush_state < ARRAY_SIZE(priority_flush_states)); + } while (flush_state < states_nr); } -static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) +static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) { DEFINE_WAIT(wait); - u64 reclaim_bytes = 0; int ret = 0; spin_lock(&space_info->lock); while (ticket->bytes > 0 && ticket->error == 0) { ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); if (ret) { - ret = -EINTR; + ticket->error = -EINTR; break; } spin_unlock(&space_info->lock); @@ -920,17 +903,54 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, finish_wait(&ticket->wait, &wait); spin_lock(&space_info->lock); } - if (!ret) - ret = ticket->error; - if (!list_empty(&ticket->list)) - list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < ticket->orig_bytes) - reclaim_bytes = ticket->orig_bytes - ticket->bytes; spin_unlock(&space_info->lock); +} + +/** + * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket + * @fs_info - the fs + * @space_info - the space_info for the reservation + * @ticket - the ticket for the reservation + * @flush - how much we can flush + * + * This does the work of figuring out how to flush for the ticket, waiting for + * the reservation, and returning the appropriate error if there is one. + */ +static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + switch (flush) { + case BTRFS_RESERVE_FLUSH_ALL: + wait_reserve_ticket(fs_info, space_info, ticket); + break; + case BTRFS_RESERVE_FLUSH_LIMIT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_flush_states, + ARRAY_SIZE(priority_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_EVICT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + evict_flush_states, + ARRAY_SIZE(evict_flush_states)); + break; + default: + ASSERT(0); + break; + } - if (reclaim_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - reclaim_bytes); + spin_lock(&space_info->lock); + ret = ticket->error; + if (ticket->bytes || ticket->error) { + list_del_init(&ticket->list); + if (!ret) + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket->list)); return ret; } @@ -956,8 +976,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, { struct reserve_ticket ticket; u64 used; - u64 reclaim_bytes = 0; int ret = 0; + bool pending_tickets; ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); @@ -965,18 +985,19 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); + pending_tickets = !list_empty(&space_info->tickets) || + !list_empty(&space_info->priority_tickets); /* * Carry on if we have enough space (short-circuit) OR call * can_overcommit() to ensure we can overcommit to continue. */ - if ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { + if (!pending_tickets && + ((used + orig_bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); ret = 0; } @@ -988,7 +1009,6 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ticket.orig_bytes = orig_bytes; ticket.bytes = orig_bytes; ticket.error = 0; init_waitqueue_head(&ticket.wait); @@ -1028,25 +1048,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) return ret; - if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket); - - ret = 0; - priority_reclaim_metadata_space(fs_info, space_info, &ticket); - spin_lock(&space_info->lock); - if (ticket.bytes) { - if (ticket.bytes < orig_bytes) - reclaim_bytes = orig_bytes - ticket.bytes; - list_del_init(&ticket.list); - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - reclaim_bytes); - ASSERT(list_empty(&ticket.list)); - return ret; + return handle_reserve_ticket(fs_info, space_info, &ticket, flush); } /** diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c2b54b8e1a14..8867e84aa33d 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -70,7 +70,6 @@ struct btrfs_space_info { }; struct reserve_ticket { - u64 orig_bytes; u64 bytes; int error; struct list_head list; @@ -87,14 +86,18 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) * * Declare a helper function to detect underflow of various space info members */ -#define DECLARE_SPACE_INFO_UPDATE(name) \ +#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + trace_btrfs_space_reservation(fs_info, trace_name, \ + sinfo->flags, abs_bytes, \ + bytes > 0); \ if (bytes < 0 && sinfo->name < -bytes) { \ WARN_ON(1); \ sinfo->name = 0; \ @@ -103,15 +106,9 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ sinfo->name += bytes; \ } -DECLARE_SPACE_INFO_UPDATE(bytes_may_use); -DECLARE_SPACE_INFO_UPDATE(bytes_pinned); +DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); -void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); -void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, @@ -129,5 +126,18 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); + +static inline void btrfs_space_info_free_bytes_may_use( + struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + spin_lock(&space_info->lock); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +} #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 4c13b737f568..73f7987143df 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -33,6 +33,8 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The extent buffer api is used to do the page spanning work required to * have a metadata blocksize different from the page size. + * + * There are 2 variants defined, one with a token pointer and one without. */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ @@ -50,8 +52,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ int size = sizeof(u##bits); \ u##bits res; \ \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ + ASSERT(token); \ + ASSERT(token->eb == eb); \ + \ + if (token->kaddr && token->offset <= offset && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ @@ -68,11 +72,33 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ } \ p = kaddr + part_offset - map_start; \ res = get_unaligned_le##bits(p + off); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + return res; \ +} \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + u##bits res; \ + \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + \ + read_extent_buffer(eb, &leres, offset, size); \ + return le##bits##_to_cpu(leres); \ } \ + p = kaddr + part_offset - map_start; \ + res = get_unaligned_le##bits(p + off); \ return res; \ } \ void btrfs_set_token_##bits(struct extent_buffer *eb, \ @@ -89,8 +115,10 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ unsigned long map_len; \ int size = sizeof(u##bits); \ \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ + ASSERT(token); \ + ASSERT(token->eb == eb); \ + \ + if (token->kaddr && token->offset <= offset && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ @@ -108,11 +136,32 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ } \ p = kaddr + part_offset - map_start; \ put_unaligned_le##bits(val, p + off); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ + token->kaddr = kaddr; \ + token->offset = map_start; \ +} \ +void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + \ + val2 = cpu_to_le##bits(val); \ + write_extent_buffer(eb, &val2, offset, size); \ + return; \ } \ + p = kaddr + part_offset - map_start; \ + put_unaligned_le##bits(val, p + off); \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 78de9d5d80c6..1b151af25772 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -43,7 +43,9 @@ #include "free-space-cache.h" #include "backref.h" #include "space-info.h" +#include "sysfs.h" #include "tests/btrfs-tests.h" +#include "block-group.h" #include "qgroup.h" #define CREATE_TRACE_POINTS @@ -1899,11 +1901,10 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 skip_space; u64 type; u64 avail_space; u64 min_stripe_size; - int min_stripes, num_stripes = 1; + int num_stripes = 1; int i = 0, nr_devices; const struct btrfs_raid_attr *rattr; @@ -1930,7 +1931,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; - min_stripes = rattr->devs_min; if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; @@ -1956,28 +1956,21 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); - avail_space *= BTRFS_STRIPE_LEN; + avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. + * + * This ensures we have at least min_stripe_size free space + * after excluding 1MB. */ - skip_space = SZ_1M; - - /* - * we can use the free space in [0, skip_space - 1], subtract - * it from the total. - */ - if (avail_space && avail_space >= skip_space) - avail_space -= skip_space; - else - avail_space = 0; - - if (avail_space < min_stripe_size) + if (avail_space <= SZ_1M + min_stripe_size) continue; + avail_space -= SZ_1M; + devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -1991,9 +1984,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i = nr_devices - 1; avail_space = 0; - while (nr_devices >= min_stripes) { - if (num_stripes > nr_devices) - num_stripes = nr_devices; + while (nr_devices >= rattr->devs_min) { + num_stripes = min(num_stripes, nr_devices); if (devices_info[i].max_avail >= min_stripe_size) { int j; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9539f8143b7a..f6d3c80f2e28 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -4,12 +4,11 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> -#include <linux/kobject.h> #include <linux/bug.h> -#include <linux/debugfs.h> #include "ctree.h" #include "disk-io.h" @@ -17,10 +16,75 @@ #include "sysfs.h" #include "volumes.h" #include "space-info.h" +#include "block-group.h" + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +/* For raid type sysfs entries */ +struct raid_kobject { + u64 flags; + struct kobject kobj; +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static struct btrfs_feature_attr *attr_to_btrfs_feature_attr( + struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) { @@ -247,6 +311,25 @@ static const struct attribute_group btrfs_static_feature_attr_group = { .attrs = btrfs_supported_static_feature_attrs, }; +#ifdef CONFIG_BTRFS_DEBUG + +/* + * Runtime debugging exported via sysfs + * + * /sys/fs/btrfs/debug - applies to module or all filesystems + * /sys/fs/btrfs/UUID - applies only to the given filesystem + */ +static struct attribute *btrfs_debug_feature_attrs[] = { + NULL +}; + +static const struct attribute_group btrfs_debug_feature_attr_group = { + .name = "debug", + .attrs = btrfs_debug_feature_attrs, +}; + +#endif + static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; @@ -316,7 +399,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -struct kobj_type btrfs_raid_ktype = { +static struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -375,7 +458,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -struct kobj_type space_info_ktype = { +static struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -655,12 +738,17 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } -const char * const btrfs_feature_set_names[FEAT_MAX] = { +static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; +const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +{ + return btrfs_feature_set_names[set]; +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ @@ -730,6 +818,110 @@ static void init_feature_attrs(void) } } +/* + * Create a sysfs entry for a given block group type at path + * /sys/fs/btrfs/UUID/allocation/data/TYPE + */ +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; + struct raid_kobject *rkobj; + const int index = btrfs_bg_flags_to_raid_index(cache->flags); + unsigned int nofs_flag; + int ret; + + /* + * Setup a NOFS context because kobject_add(), deep in its call chain, + * does GFP_KERNEL allocations, and we are often called in a context + * where if reclaim is triggered we can deadlock (we are either holding + * a transaction handle or some lock required for a transaction + * commit). + */ + nofs_flag = memalloc_nofs_save(); + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + memalloc_nofs_restore(nofs_flag); + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; + } + + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", + btrfs_bg_type_to_raid_name(rkobj->flags)); + memalloc_nofs_restore(nofs_flag); + if (ret) { + kobject_put(&rkobj->kobj); + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); + return; + } + + space_info->block_group_kobjs[index] = &rkobj->kobj; +} + +/* + * Remove sysfs directories for all block group types of a given space info and + * the space info as well + */ +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) +{ + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +/* + * Create a sysfs entry for a space info type at path + * /sys/fs/btrfs/UUID/allocation/TYPE + */ +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + int ret; + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + fs_info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + return 0; +} + /* when one_device is NULL, it removes all device links */ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, @@ -806,14 +998,34 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, return error; } -/* /sys/fs/btrfs/ entry */ -static struct kset *btrfs_kset; +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) +{ + int ret; -/* /sys/kernel/debug/btrfs */ -static struct dentry *btrfs_debugfs_root_dentry; + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} -/* Debugging tunables and exported data */ -u64 btrfs_debugfs_test; +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + + /* + * Sprouting changes fsid of the mounted filesystem, rename the fsid + * directory + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) + btrfs_warn(fs_devices->fs_info, + "sysfs: failed to create fsid for sprout"); +} + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; /* * Can be called by the device discovery thread. @@ -859,6 +1071,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) goto failure; +#ifdef CONFIG_BTRFS_DEBUG + error = sysfs_create_group(fsid_kobj, + &btrfs_debug_feature_attr_group); + if (error) + goto failure; +#endif + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; @@ -913,25 +1132,6 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); } -static void btrfs_init_debugfs(void) -{ -#ifdef CONFIG_DEBUG_FS - btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); - - /* - * Example code, how to export data through debugfs. - * - * file: /sys/kernel/debug/btrfs/test - * contents of: btrfs_debugfs_test - */ -#ifdef CONFIG_BTRFS_DEBUG - debugfs_create_u64("test", S_IRUGO | S_IWUSR, btrfs_debugfs_root_dentry, - &btrfs_debugfs_test); -#endif - -#endif -} - int __init btrfs_init_sysfs(void) { int ret; @@ -940,8 +1140,6 @@ int __init btrfs_init_sysfs(void) if (!btrfs_kset) return -ENOMEM; - btrfs_init_debugfs(); - init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) @@ -951,12 +1149,17 @@ int __init btrfs_init_sysfs(void) if (ret) goto out_remove_group; +#ifdef CONFIG_BTRFS_DEBUG + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); + if (ret) + goto out2; +#endif + return 0; out_remove_group: sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: - debugfs_remove_recursive(btrfs_debugfs_root_dentry); kset_unregister(btrfs_kset); return ret; @@ -968,6 +1171,5 @@ void __cold btrfs_exit_sysfs(void) &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); - debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 40716b357c1d..610e9c36a94c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,10 +3,7 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H -/* - * Data exported through sysfs - */ -extern u64 btrfs_debugfs_test; +#include <linux/kobject.h> enum btrfs_feature_set { FEAT_COMPAT, @@ -15,71 +12,8 @@ enum btrfs_feature_set { FEAT_MAX }; -#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ -{ \ - .attr = { .name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -} - -#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ - __INIT_KOBJ_ATTR(_name, 0644, _show, _store) - -#define BTRFS_ATTR(_prefix, _name, _show) \ - static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_ATTR_PTR(_prefix, _name) \ - (&btrfs_attr_##_prefix##_##_name.attr) - - -struct btrfs_feature_attr { - struct kobj_attribute kobj_attr; - enum btrfs_feature_set feature_set; - u64 feature_bit; -}; - -#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ - .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ - btrfs_feature_attr_show, \ - btrfs_feature_attr_store), \ - .feature_set = _feature_set, \ - .feature_bit = _feature_prefix ##_## _feature_bit, \ -} -#define BTRFS_FEAT_ATTR_PTR(_name) \ - (&btrfs_attr_features_##_name.kobj_attr.attr) - -#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) -#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) -#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) - -/* convert from attribute */ -static inline struct btrfs_feature_attr * -to_btrfs_feature_attr(struct kobj_attribute *a) -{ - return container_of(a, struct btrfs_feature_attr, kobj_attr); -} - -static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) -{ - return container_of(attr, struct kobj_attribute, attr); -} - -static inline struct btrfs_feature_attr * -attr_to_btrfs_feature_attr(struct attribute *attr) -{ - return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); -} - char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -extern const char * const btrfs_feature_set_names[FEAT_MAX]; -extern struct kobj_type space_info_ktype; -extern struct kobj_type btrfs_raid_ktype; +const char * const btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, @@ -88,7 +22,19 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, + const u8 *fsid); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); + +int __init btrfs_init_sysfs(void); +void __cold btrfs_exit_sysfs(void); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache); +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1e3ba4949399..b5e80563efaa 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -15,6 +15,7 @@ #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../block-group.h" static struct vfsmount *test_mnt = NULL; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 1bf6b5a79191..123d9a614357 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -438,6 +438,7 @@ static int test_find_first_clear_extent_bit(void) { struct extent_io_tree tree; u64 start, end; + int ret = -EINVAL; test_msg("running find_first_clear_extent_bit test"); extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); @@ -452,9 +453,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - if (start != 0 || end != SZ_1M -1) + if (start != 0 || end != SZ_1M - 1) { test_err("error finding beginning range: start %llu end %llu", start, end); + goto out; + } /* Now add 32M-64M so that we have a hole between 4M-32M */ set_extent_bits(&tree, SZ_32M, SZ_64M - 1, @@ -466,9 +469,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - if (start != SZ_4M || end != SZ_32M - 1) + if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding trimmed range: start %llu end %llu", start, end); + goto out; + } /* * Search in the middle of allocated range, should get the next one @@ -477,9 +482,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - if (start != SZ_4M || end != SZ_32M -1) + if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding next unalloc range: start %llu end %llu", start, end); + goto out; + } /* * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag @@ -489,9 +496,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, CHUNK_TRIMMED); - if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding exact range: start %llu end %llu", start, end); + goto out; + } find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, CHUNK_TRIMMED); @@ -500,21 +509,29 @@ static int test_find_first_clear_extent_bit(void) * Search in the middle of set range whose immediate neighbour doesn't * have the bits set so it must be returned */ - if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding next alloc range: start %llu end %llu", start, end); + goto out; + } /* * Search beyond any known range, shall return after last known range * and end should be -1 */ find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); - if (start != SZ_64M + SZ_8M || end != -1) + if (start != SZ_64M + SZ_8M || end != -1) { test_err( "error handling beyond end of range search: start %llu end %llu", start, end); + goto out; + } - return 0; + ret = 0; +out: + clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + + return ret; } int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index af89f66f9e63..43ec7060fcd2 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../disk-io.h" #include "../free-space-cache.h" +#include "../block-group.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index a90dad166971..bc92df977630 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -9,6 +9,7 @@ #include "../disk-io.h" #include "../free-space-tree.h" #include "../transaction.h" +#include "../block-group.h" struct free_space_extent { u64 start; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index bc6dbd1b42fd..09ecf7dc7b08 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -957,7 +957,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, - NULL, 0); + NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -972,7 +972,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -988,8 +988,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1005,7 +1004,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1023,7 +1022,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1040,7 +1039,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1056,8 +1055,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1075,7 +1073,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1089,8 +1087,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1105,8 +1102,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e3adb714c04b..8624bdee8c5b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -19,6 +20,7 @@ #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" +#include "block-group.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -484,7 +486,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * worth of delayed refs updates in this trans handle, and * refill that amount for whatever is missing in the reserve. */ - num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (delayed_refs_rsv->full == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; @@ -635,7 +637,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( if (IS_ERR(trans)) return trans; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv, num_bytes, min_factor); if (ret) { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ccd5706199d7..43e488f5d063 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -821,6 +821,417 @@ static int check_inode_item(struct extent_buffer *leaf, return 0; } +static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_root_item ri; + const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | + BTRFS_ROOT_SUBVOL_DEAD; + + /* No such tree id */ + if (key->objectid == 0) { + generic_err(leaf, slot, "invalid root id 0"); + return -EUCLEAN; + } + + /* + * Some older kernel may create ROOT_ITEM with non-zero offset, so here + * we only check offset for reloc tree whose key->offset must be a + * valid tree. + */ + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } + + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + generic_err(leaf, slot, + "invalid root item size, have %u expect %zu", + btrfs_item_size_nr(leaf, slot), sizeof(ri)); + } + + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), + sizeof(ri)); + + /* Generation related */ + if (btrfs_root_generation(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root generation, have %llu expect (0, %llu]", + btrfs_root_generation(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (btrfs_root_generation_v2(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root v2 generation, have %llu expect (0, %llu]", + btrfs_root_generation_v2(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (btrfs_root_last_snapshot(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root last_snapshot, have %llu expect (0, %llu]", + btrfs_root_last_snapshot(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + + /* Alignment and level check */ + if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) { + generic_err(leaf, slot, + "invalid root bytenr, have %llu expect to be aligned to %u", + btrfs_root_bytenr(&ri), fs_info->sectorsize); + return -EUCLEAN; + } + if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + if (ri.drop_level >= BTRFS_MAX_LEVEL) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + ri.drop_level, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* Flags check */ + if (btrfs_root_flags(&ri) & ~valid_root_flags) { + generic_err(leaf, slot, + "invalid root flags, have 0x%llx expect mask 0x%llx", + btrfs_root_flags(&ri), valid_root_flags); + return -EUCLEAN; + } + return 0; +} + +__printf(3,4) +__cold +static void extent_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + u64 bytenr; + u64 len; + + btrfs_item_key_to_cpu(eb, &key, slot); + bytenr = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_TREE_BLOCK_REF_KEY || + key.type == BTRFS_SHARED_BLOCK_REF_KEY) + len = eb->fs_info->nodesize; + else + len = key.offset; + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + eb->start, slot, bytenr, len, &vaf); + va_end(args); +} + +static int check_extent_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_extent_item *ei; + bool is_tree_block = false; + unsigned long ptr; /* Current pointer inside inline refs */ + unsigned long end; /* Extent item end */ + const u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 flags; + u64 generation; + u64 total_refs; /* Total refs in btrfs_extent_item */ + u64 inline_refs = 0; /* found total inline refs */ + + if (key->type == BTRFS_METADATA_ITEM_KEY && + !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + generic_err(leaf, slot, +"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); + return -EUCLEAN; + } + /* key->objectid is the bytenr for both key types */ + if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) { + generic_err(leaf, slot, + "invalid key objectid, have %llu expect to be aligned to %u", + key->objectid, fs_info->sectorsize); + return -EUCLEAN; + } + + /* key->offset is tree level for METADATA_ITEM_KEY */ + if (key->type == BTRFS_METADATA_ITEM_KEY && + key->offset >= BTRFS_MAX_LEVEL) { + extent_err(leaf, slot, + "invalid tree level, have %llu expect [0, %u]", + key->offset, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* + * EXTENT/METADATA_ITEM consists of: + * 1) One btrfs_extent_item + * Records the total refs, type and generation of the extent. + * + * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only) + * Records the first key and level of the tree block. + * + * 2) Zero or more btrfs_extent_inline_ref(s) + * Each inline ref has one btrfs_extent_inline_ref shows: + * 2.1) The ref type, one of the 4 + * TREE_BLOCK_REF Tree block only + * SHARED_BLOCK_REF Tree block only + * EXTENT_DATA_REF Data only + * SHARED_DATA_REF Data only + * 2.2) Ref type specific data + * Either using btrfs_extent_inline_ref::offset, or specific + * data structure. + */ + if (item_size < sizeof(*ei)) { + extent_err(leaf, slot, + "invalid item size, have %u expect [%zu, %u)", + item_size, sizeof(*ei), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + end = item_size + btrfs_item_ptr_offset(leaf, slot); + + /* Checks against extent_item */ + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + total_refs = btrfs_extent_refs(leaf, ei); + generation = btrfs_extent_generation(leaf, ei); + if (generation > btrfs_super_generation(fs_info->super_copy) + 1) { + extent_err(leaf, slot, + "invalid generation, have %llu expect (0, %llu]", + generation, + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (!is_power_of_2(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK))) { + extent_err(leaf, slot, + "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", + flags, BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK); + return -EUCLEAN; + } + is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); + if (is_tree_block) { + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->offset != fs_info->nodesize) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect %u", + key->offset, fs_info->nodesize); + return -EUCLEAN; + } + } else { + if (key->type != BTRFS_EXTENT_ITEM_KEY) { + extent_err(leaf, slot, + "invalid key type, have %u expect %u for data backref", + key->type, BTRFS_EXTENT_ITEM_KEY); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect aligned to %u", + key->offset, fs_info->sectorsize); + return -EUCLEAN; + } + } + ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); + + /* Check the special case of btrfs_tree_block_info */ + if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) { + extent_err(leaf, slot, + "invalid tree block info level, have %u expect [0, %u]", + btrfs_tree_block_level(leaf, info), + BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1); + } + + /* Check inline refs */ + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u64 dref_offset; + u64 inline_offset; + u8 inline_type; + + if (ptr + sizeof(*iref) > end) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %zu end %lu", + ptr, sizeof(*iref), end); + return -EUCLEAN; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + inline_type = btrfs_extent_inline_ref_type(leaf, iref); + inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %u end %lu", + ptr, inline_type, end); + return -EUCLEAN; + } + + switch (inline_type) { + /* inline_offset is subvolid of the owner, no need to check */ + case BTRFS_TREE_BLOCK_REF_KEY: + inline_refs++; + break; + /* Contains parent bytenr */ + case BTRFS_SHARED_BLOCK_REF_KEY: + if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs++; + break; + /* + * Contains owner subvolid, owner key objectid, adjusted offset. + * The only obvious corruption can happen in that offset. + */ + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid data ref offset, have %llu expect aligned to %u", + dref_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_extent_data_ref_count(leaf, dref); + break; + /* Contains parent bytenr and ref count */ + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid data parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_shared_data_ref_count(leaf, sref); + break; + default: + extent_err(leaf, slot, "unknown inline ref type: %u", + inline_type); + return -EUCLEAN; + } + ptr += btrfs_extent_inline_ref_size(inline_type); + } + /* No padding is allowed */ + if (ptr != end) { + extent_err(leaf, slot, + "invalid extent item size, padding bytes found"); + return -EUCLEAN; + } + + /* Finally, check the inline refs against total refs */ + if (inline_refs > total_refs) { + extent_err(leaf, slot, + "invalid extent refs, have %llu expect >= inline %llu", + total_refs, inline_refs); + return -EUCLEAN; + } + return 0; +} + +static int check_simple_keyed_refs(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 expect_item_size = 0; + + if (key->type == BTRFS_SHARED_DATA_REF_KEY) + expect_item_size = sizeof(struct btrfs_shared_data_ref); + + if (btrfs_item_size_nr(leaf, slot) != expect_item_size) { + generic_err(leaf, slot, + "invalid item size, have %u expect %u for key type %u", + btrfs_item_size_nr(leaf, slot), + expect_item_size, key->type); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + if (key->type != BTRFS_TREE_BLOCK_REF_KEY && + !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + key->offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + return 0; +} + +static int check_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_extent_data_ref *dref; + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); + + if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) { + generic_err(leaf, slot, + "invalid item size, have %u expect aligned to %zu for key type %u", + btrfs_item_size_nr(leaf, slot), + sizeof(*dref), key->type); + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + for (; ptr < end; ptr += sizeof(*dref)) { + u64 root_objectid; + u64 owner; + u64 offset; + u64 hash; + + dref = (struct btrfs_extent_data_ref *)ptr; + root_objectid = btrfs_extent_data_ref_root(leaf, dref); + owner = btrfs_extent_data_ref_objectid(leaf, dref); + offset = btrfs_extent_data_ref_offset(leaf, dref); + hash = hash_extent_data_ref(root_objectid, owner, offset); + if (hash != key->offset) { + extent_err(leaf, slot, + "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", + hash, key->offset); + return -EUCLEAN; + } + if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid extent data backref offset, have %llu expect aligned to %u", + offset, leaf->fs_info->sectorsize); + } + } + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -856,6 +1267,21 @@ static int check_leaf_item(struct extent_buffer *leaf, case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; + case BTRFS_ROOT_ITEM_KEY: + ret = check_root_item(leaf, key, slot); + break; + case BTRFS_EXTENT_ITEM_KEY: + case BTRFS_METADATA_ITEM_KEY: + ret = check_extent_item(leaf, key, slot); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = check_simple_keyed_refs(leaf, key, slot); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + ret = check_extent_data_ref(leaf, key, slot); + break; } return ret; } @@ -899,6 +1325,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) owner); return -EUCLEAN; } + /* Unknown tree */ + if (owner == 0) { + generic_err(leaf, 0, + "invalid owner, root 0 is not defined"); + return -EUCLEAN; + } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6c8297bcfeb7..29b82a795522 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> +#include "misc.h" #include "ctree.h" #include "tree-log.h" #include "disk-io.h" @@ -24,10 +25,12 @@ * LOG_INODE_EXISTS means to log just enough to recreate the inode * during log replay */ -#define LOG_INODE_ALL 0 -#define LOG_INODE_EXISTS 1 -#define LOG_OTHER_INODE 2 -#define LOG_OTHER_INODE_ALL 3 +enum { + LOG_INODE_ALL, + LOG_INODE_EXISTS, + LOG_OTHER_INODE, + LOG_OTHER_INODE_ALL, +}; /* * directory trouble cases @@ -81,10 +84,12 @@ * The last stage is to deal with directories and links and extents * and all the other fun semantics */ -#define LOG_WALK_PIN_ONLY 0 -#define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_DIR_INDEX 2 -#define LOG_WALK_REPLAY_ALL 3 +enum { + LOG_WALK_PIN_ONLY, + LOG_WALK_REPLAY_INODES, + LOG_WALK_REPLAY_DIR_INDEX, + LOG_WALK_REPLAY_ALL, +}; static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -188,10 +193,6 @@ static int join_running_log_trans(struct btrfs_root *root) { int ret = -ENOENT; - smp_mb(); - if (!root->log_root) - return -ENOENT; - mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; @@ -505,7 +506,7 @@ insert: ino_size != 0) { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, dst_eb); btrfs_set_token_inode_size(dst_eb, dst_item, ino_size, &token); } @@ -967,7 +968,7 @@ static noinline int backref_in_log(struct btrfs_root *log, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, namelen, NULL)) + name, namelen)) match = 1; goto out; @@ -1266,12 +1267,12 @@ again: goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen); else - ret = btrfs_find_name_in_backref(log_eb, log_slot, name, - namelen, NULL); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, + name, namelen); if (!ret) { struct inode *dir; @@ -1333,12 +1334,11 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, goto out; } if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, - name, namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, name, namelen); else - ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen, NULL); + ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen); out: btrfs_free_path(path); @@ -3842,7 +3842,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); if (log_inode_only) { /* set the generation to zero so the recover code @@ -4302,8 +4302,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - btrfs_init_map_token(&token); - ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); @@ -4321,6 +4319,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } leaf = path->nodes[0]; + btrfs_init_map_token(&token, leaf); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -4985,7 +4984,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); } } continue; @@ -5000,7 +4999,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE, 0, LLONG_MAX, ctx); if (ret) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5009,7 +5008,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5056,7 +5055,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, } path->slots[0]++; } - iput(inode); + btrfs_add_delayed_iput(inode); } return ret; @@ -5689,7 +5688,7 @@ process_leaf: } if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { - iput(di_inode); + btrfs_add_delayed_iput(di_inode); break; } @@ -5701,7 +5700,7 @@ process_leaf: if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; - iput(di_inode); + btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; if (ctx->log_new_dentries) { @@ -5848,7 +5847,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); - iput(dir_inode); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -5891,7 +5890,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -6233,7 +6232,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, - .stage = 0, + .stage = LOG_WALK_PIN_ONLY, }; path = btrfs_alloc_path(); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a447d3ec48d5..a324480bc88b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -24,11 +25,11 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" -#include "math.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" #include "space-info.h" +#include "block-group.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -190,7 +191,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -358,19 +358,6 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } -static void btrfs_kobject_uevent(struct block_device *bdev, - enum kobject_action action) -{ - int ret; - - ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); - if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", - action, - kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), - &disk_to_dev(bdev->bd_disk)->kobj); -} - void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -1128,6 +1115,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_fs_devices *fs_devices; struct btrfs_device *device; struct btrfs_device *orig_dev; + int ret = 0; fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) @@ -1141,8 +1129,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid); - if (IS_ERR(device)) + if (IS_ERR(device)) { + ret = PTR_ERR(device); goto error; + } /* * This is ok to do without rcu read locked because we hold the @@ -1153,6 +1143,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) GFP_KERNEL); if (!name) { btrfs_free_device(device); + ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); @@ -1167,7 +1158,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) error: mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } /* @@ -1551,9 +1542,16 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, * @len is used to store the size of the free space that we find. * But if we don't find suitable free space, it is used to store the size of * the max free space. + * + * NOTE: This function will search *commit* root of device tree, and does extra + * check to ensure dev extents are not double allocated. + * This makes the function safe to allocate dev extents but may not report + * correct usable device space, as device extent freed in current transaction + * is not reported as avaiable. */ -int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *len) +static int find_free_dev_extent_start(struct btrfs_device *device, + u64 num_bytes, u64 search_start, u64 *start, + u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -1855,7 +1853,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* Corruption */ + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(fs_info->chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID, @@ -2686,22 +2689,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; - ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } - /* Sprouting would change fsid of the mounted root, - * so rename the fsid on the sysfs - */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fs_devices->fsid); - if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) - btrfs_warn(fs_info, - "sysfs: failed to create fsid for sprout"); + btrfs_sysfs_update_sprout_fsid(fs_devices, + fs_info->fs_devices->fsid); } ret = btrfs_commit_transaction(trans); @@ -3076,10 +3071,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) */ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); - ret = btrfs_can_relocate(fs_info, chunk_offset); - if (ret) - return -ENOSPC; - /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); @@ -6011,7 +6002,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - u64 offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6042,11 +6032,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, return ret; em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(em); + ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; - offset = geom.offset; stripe_len = geom.stripe_len; stripe_nr = geom.stripe_nr; stripe_offset = geom.stripe_offset; @@ -7296,18 +7285,32 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) } } -static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, + int index) { - int i; + u64 val; - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_reset(dev, i); + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_key key; - struct btrfs_key found_key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct extent_buffer *eb; @@ -7318,10 +7321,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) int i; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { @@ -7333,14 +7334,14 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - __btrfs_reset_dev_stats(device); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); continue; } slot = path->slots[0]; eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); item_size = btrfs_item_size_nr(eb, slot); ptr = btrfs_item_ptr(eb, slot, @@ -7351,7 +7352,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) btrfs_dev_stat_set(device, i, btrfs_dev_stats_value(eb, ptr, i)); else - btrfs_dev_stat_reset(device, i); + btrfs_dev_stat_set(device, i, 0); } device->dev_stats_valid = 1; @@ -7360,7 +7361,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); -out: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -7534,7 +7534,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, stats->values[i] = btrfs_dev_stat_read_and_reset(dev, i); else - btrfs_dev_stat_reset(dev, i); + btrfs_dev_stat_set(dev, i, 0); } } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7f6aa1816409..a7da1f3e3627 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -82,7 +82,6 @@ struct btrfs_device { unsigned long dev_state; blk_status_t last_flush_error; - int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -475,8 +474,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *max_avail); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); @@ -550,12 +547,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } -static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, - int index) -{ - btrfs_dev_stat_set(dev, index, 0); -} - /* * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which * can be used as index to access btrfs_raid_array[]. diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b86b7ad6b900..df1aace5df50 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -418,14 +418,6 @@ next: return ret; } -static unsigned int zlib_set_level(unsigned int level) -{ - if (!level) - return BTRFS_ZLIB_DEFAULT_LEVEL; - - return min_t(unsigned int, level, 9); -} - const struct btrfs_compress_op btrfs_zlib_compress = { .init_workspace_manager = zlib_init_workspace_manager, .cleanup_workspace_manager = zlib_cleanup_workspace_manager, @@ -436,5 +428,6 @@ const struct btrfs_compress_op btrfs_zlib_compress = { .compress_pages = zlib_compress_pages, .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, - .set_level = zlib_set_level, + .max_level = 9, + .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3837ca180d52..764d47b107e5 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> +#include "misc.h" #include "compression.h" #include "ctree.h" @@ -710,14 +711,6 @@ finish: return ret; } -static unsigned int zstd_set_level(unsigned int level) -{ - if (!level) - return ZSTD_BTRFS_DEFAULT_LEVEL; - - return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL); -} - const struct btrfs_compress_op btrfs_zstd_compress = { .init_workspace_manager = zstd_init_workspace_manager, .cleanup_workspace_manager = zstd_cleanup_workspace_manager, @@ -728,5 +721,6 @@ const struct btrfs_compress_op btrfs_zstd_compress = { .compress_pages = zstd_compress_pages, .decompress_bio = zstd_decompress_bio, .decompress = zstd_decompress, - .set_level = zstd_set_level, + .max_level = ZSTD_BTRFS_MAX_LEVEL, + .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ab4868c7308e..377fafc76f20 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -979,6 +979,8 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_export_op = &ceph_export_ops; s->s_time_gran = 1; + s->s_time_min = 0; + s->s_time_max = U32_MAX; ret = set_anon_super(s, NULL); /* what is that second arg for? */ if (ret != 0) diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index b16219e5dac9..22cf04fb32d3 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -16,7 +16,7 @@ config CIFS select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES - select CRYPTO_DES + select CRYPTO_LIB_DES select KEYS help This is the client VFS module for the SMB3 family of NAS protocols, @@ -211,3 +211,11 @@ config CIFS_FSCACHE Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data to be cached locally on disk through the general filesystem cache manager. If unsure, say N. + +config CIFS_ROOT + bool "SMB root file system (Experimental)" + depends on CIFS=y && IP_PNP + help + Enables root file system support over SMB protocol. + + Most people say N here. diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 41332f20055b..51bae9340842 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -21,3 +21,5 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o + +cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index a38d796f5ffe..0b4eee3bed66 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -452,6 +452,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + server->max_in_flight = 0; #ifdef CONFIG_CIFS_STATS2 for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) { atomic_set(&server->num_cmds[i], 0); @@ -526,6 +527,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + seq_printf(m, "\nMax requests in flight: %d", server->max_in_flight); #ifdef CONFIG_CIFS_STATS2 seq_puts(m, "\nTotal time spent processing by command. Time "); seq_printf(m, "units are jiffies (%d per second)\n", HZ); diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 3d392620a2f4..100b0056a369 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -80,6 +80,60 @@ do { \ type, fmt, ##__VA_ARGS__); \ } while (0) +#define cifs_server_dbg_func(ratefunc, type, fmt, ...) \ +do { \ + const char *sn = ""; \ + if (server && server->hostname) \ + sn = server->hostname; \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: \\\\%s " fmt, \ + __FILE__, sn, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("CIFS VFS: \\\\%s " fmt, \ + sn, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc("\\\\%s " fmt, \ + sn, ##__VA_ARGS__); \ + } \ +} while (0) + +#define cifs_server_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_server_dbg_func(once, \ + type, fmt, ##__VA_ARGS__); \ + else \ + cifs_server_dbg_func(ratelimited, \ + type, fmt, ##__VA_ARGS__); \ +} while (0) + +#define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \ +do { \ + const char *tn = ""; \ + if (tcon && tcon->treeName) \ + tn = tcon->treeName; \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: %s " fmt, \ + __FILE__, tn, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("CIFS VFS: %s " fmt, \ + tn, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc("%s " fmt, \ + tn, ##__VA_ARGS__); \ + } \ +} while (0) + +#define cifs_tcon_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_tcon_dbg_func(once, \ + type, fmt, ##__VA_ARGS__); \ + else \ + cifs_tcon_dbg_func(ratelimited, \ + type, fmt, ##__VA_ARGS__); \ +} while (0) + /* * debug OFF * --------- @@ -91,6 +145,19 @@ do { \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) +#define cifs_server_dbg(type, fmt, ...) \ +do { \ + if (0) \ + pr_debug("\\\\%s " fmt, \ + server->hostname, ##__VA_ARGS__); \ +} while (0) + +#define cifs_tcon_dbg(type, fmt, ...) \ +do { \ + if (0) \ + pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \ +} while (0) + #define cifs_info(fmt, ...) \ do { \ pr_info("CIFS: "fmt, ##__VA_ARGS__); \ diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index b326d2ca3765..6e7c4427369d 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -53,6 +53,8 @@ #define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */ #define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */ #define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */ +#define CIFS_MOUNT_RO_CACHE 0x20000000 /* assumes share will not change */ +#define CIFS_MOUNT_RW_CACHE 0x40000000 /* assumes only client accessing */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 086ddc5108af..6c3bd07868d7 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -46,6 +46,7 @@ struct smb_snapshot_array { /* query_info flags */ #define PASSTHRU_QUERY_INFO 0x00000000 #define PASSTHRU_FSCTL 0x00000001 +#define PASSTHRU_SET_INFO 0x00000002 struct smb_query_info { __u32 info_type; __u32 file_info_class; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1d377b7f2860..f842944a5c76 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -701,10 +701,9 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl) } #endif - static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, - struct cifs_fattr *fattr) + struct cifs_fattr *fattr, bool mode_from_special_sid) { int i; int num_aces = 0; @@ -757,22 +756,34 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif - if (compare_sids(&(ppace[i]->sid), pownersid) == 0) + if (mode_from_special_sid && + (compare_sids(&(ppace[i]->sid), + &sid_unix_NFS_mode) == 0)) { + /* + * Full permissions are: + * 07777 = S_ISUID | S_ISGID | S_ISVTX | + * S_IRWXU | S_IRWXG | S_IRWXO + */ + fattr->cf_mode &= ~07777; + fattr->cf_mode |= + le32_to_cpu(ppace[i]->sid.sub_auth[2]); + break; + } else if (compare_sids(&(ppace[i]->sid), pownersid) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &user_mask); - if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) + else if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &group_mask); - if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) + else if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &other_mask); - if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0) + else if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, @@ -795,22 +806,49 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, - struct cifs_sid *pgrpsid, __u64 nmode) + struct cifs_sid *pgrpsid, __u64 nmode, bool modefromsid) { u16 size = 0; + u32 num_aces = 0; struct cifs_acl *pnndacl; pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + if (modefromsid) { + struct cifs_ace *pntace = + (struct cifs_ace *)((char *)pnndacl + size); + int i; + + pntace->type = ACCESS_ALLOWED; + pntace->flags = 0x0; + pntace->access_req = 0; + pntace->sid.num_subauth = 3; + pntace->sid.revision = 1; + for (i = 0; i < NUM_AUTHS; i++) + pntace->sid.authority[i] = + sid_unix_NFS_mode.authority[i]; + pntace->sid.sub_auth[0] = sid_unix_NFS_mode.sub_auth[0]; + pntace->sid.sub_auth[1] = sid_unix_NFS_mode.sub_auth[1]; + pntace->sid.sub_auth[2] = cpu_to_le32(nmode & 07777); + + /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ + pntace->size = cpu_to_le16(28); + size += 28; + num_aces++; + } + size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), pownersid, nmode, S_IRWXU); + num_aces++; size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), pgrpsid, nmode, S_IRWXG); + num_aces++; size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), &sid_everyone, nmode, S_IRWXO); + num_aces++; + pndacl->num_aces = cpu_to_le32(num_aces); pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); - pndacl->num_aces = cpu_to_le32(3); return 0; } @@ -851,7 +889,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) /* Convert CIFS ACL to POSIX form */ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, - struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr) + struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr, + bool get_mode_from_special_sid) { int rc = 0; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; @@ -900,7 +939,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, if (dacloffset) parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, - group_sid_ptr, fattr); + group_sid_ptr, fattr, get_mode_from_special_sid); else cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */ @@ -909,7 +948,8 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag) + __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, + bool mode_from_sid, int *aclflag) { int rc = 0; __u32 dacloffset; @@ -934,7 +974,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, ndacl_ptr->num_aces = 0; rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, - nmode); + nmode, mode_from_sid); sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); /* copy sec desc control portion & owner and group sids */ copy_sec_desc(pntsd, pnntsd, sidsoffset); @@ -1128,8 +1168,8 @@ out: /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - struct inode *inode, const char *path, - const struct cifs_fid *pfid) + struct inode *inode, bool mode_from_special_sid, + const char *path, const struct cifs_fid *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; @@ -1156,8 +1196,11 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); + } else if (mode_from_special_sid) { + rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, true); } else { - rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); + /* get approximated mode from ACL */ + rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, false); kfree(pntsd); if (rc) cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); @@ -1181,6 +1224,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; + bool mode_from_sid; if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1218,8 +1262,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, return -ENOMEM; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) + mode_from_sid = true; + else + mode_from_sid = false; + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, - &aclflag); + mode_from_sid, &aclflag); cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index dd95a6fa24bf..eb428349f29a 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -45,7 +45,7 @@ */ #define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \ sizeof(struct cifs_acl) + \ - (sizeof(struct cifs_ace) * 3)) + (sizeof(struct cifs_ace) * 4)) /* * Maximum size of a string representation of a SID: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3289b566463f..2e9c7f493f99 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -56,6 +56,15 @@ #include "dfs_cache.h" #endif +/* + * DOS dates from 1980/1/1 through 2107/12/31 + * Protocol specifications indicate the range should be to 119, which + * limits maximum year to 2099. But this range has not been checked. + */ +#define SMB_DATE_MAX (127<<9 | 12<<5 | 31) +#define SMB_DATE_MIN (0<<9 | 1<<5 | 1) +#define SMB_TIME_MAX (23<<11 | 59<<5 | 29) + int cifsFYI = 0; bool traceSMB; bool enable_oplocks = true; @@ -109,6 +118,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +struct workqueue_struct *decrypt_wq; struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; @@ -142,6 +152,7 @@ cifs_read_super(struct super_block *sb) struct inode *inode; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; + struct timespec64 ts; int rc = 0; cifs_sb = CIFS_SB(sb); @@ -161,6 +172,18 @@ cifs_read_super(struct super_block *sb) /* BB FIXME fix time_gran to be larger for LANMAN sessions */ sb->s_time_gran = 100; + if (tcon->unix_ext) { + ts = cifs_NTtimeToUnix(0); + sb->s_time_min = ts.tv_sec; + ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); + sb->s_time_max = ts.tv_sec; + } else { + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); + sb->s_time_min = ts.tv_sec; + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0); + sb->s_time_max = ts.tv_sec; + } + sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; @@ -400,6 +423,10 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) seq_puts(s, "strict"); else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) seq_puts(s, "none"); + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) + seq_puts(s, "singleclient"); /* assume only one client access */ + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) + seq_puts(s, "ro"); /* read only caching assumed */ else seq_puts(s, "loose"); } @@ -433,6 +460,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); + if (tcon->no_lease) + seq_puts(s, ",nolease"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) @@ -554,6 +583,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); seq_printf(s, ",bsize=%u", cifs_sb->bsize); + if (tcon->ses->server->min_offload) + seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); @@ -1495,11 +1526,25 @@ init_cifs(void) goto out_clean_proc; } + /* + * Consider in future setting limit!=0 maybe to min(num_of_cores - 1, 3) + * so that we don't launch too many worker threads but + * Documentation/workqueue.txt recommends setting it to 0 + */ + + /* WQ_UNBOUND allows decrypt tasks to run on any CPU */ + decrypt_wq = alloc_workqueue("smb3decryptd", + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!decrypt_wq) { + rc = -ENOMEM; + goto out_destroy_cifsiod_wq; + } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!cifsoplockd_wq) { rc = -ENOMEM; - goto out_destroy_cifsiod_wq; + goto out_destroy_decrypt_wq; } rc = cifs_fscache_register(); @@ -1565,6 +1610,8 @@ out_unreg_fscache: cifs_fscache_unregister(); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); +out_destroy_decrypt_wq: + destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); out_clean_proc: @@ -1591,6 +1638,7 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_fscache_unregister(); destroy_workqueue(cifsoplockd_wq); + destroy_workqueue(decrypt_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } @@ -1601,7 +1649,6 @@ MODULE_DESCRIPTION ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and " "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); -MODULE_SOFTDEP("pre: des"); MODULE_SOFTDEP("pre: ecb"); MODULE_SOFTDEP("pre: hmac"); MODULE_SOFTDEP("pre: md4"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 99caf77df4a2..bc4ca94137f2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.22" +#define CIFS_VERSION "2.23" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index fe610e7e3670..54e204589cb9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -542,6 +542,7 @@ struct smb_vol { umode_t dir_mode; enum securityEnum sectype; /* sectype requested via mnt opts */ bool sign; /* was signing requested via mnt opts? */ + bool ignore_signature:1; bool retry:1; bool intr:1; bool setuids:1; @@ -559,6 +560,8 @@ struct smb_vol { bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; bool strict_io:1; /* strict cache behavior */ + bool cache_ro:1; + bool cache_rw:1; bool remap:1; /* set to remap seven reserved chars in filenames */ bool sfu_remap:1; /* remap seven reserved chars ala SFU */ bool posix_paths:1; /* unset to not ask for posix pathnames. */ @@ -576,6 +579,7 @@ struct smb_vol { bool noblocksnd:1; bool noautotune:1; bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool no_lease:1; /* disable requesting leases */ bool fsc:1; /* enable fscache */ bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; @@ -589,6 +593,7 @@ struct smb_vol { unsigned int bsize; unsigned int rsize; unsigned int wsize; + unsigned int min_offload; bool sockopt_tcp_nodelay:1; unsigned long actimeo; /* attribute cache timeout (jiffies) */ struct smb_version_operations *ops; @@ -602,6 +607,7 @@ struct smb_vol { __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */ + bool rootfs:1; /* if it's a SMB root file system */ }; /** @@ -620,7 +626,8 @@ struct smb_vol { CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID | \ CIFS_MOUNT_UID_FROM_ACL | CIFS_MOUNT_NO_HANDLE_CACHE | \ - CIFS_MOUNT_NO_DFS | CIFS_MOUNT_MODE_FROM_SID) + CIFS_MOUNT_NO_DFS | CIFS_MOUNT_MODE_FROM_SID | \ + CIFS_MOUNT_RO_CACHE | CIFS_MOUNT_RW_CACHE) /** * Generic VFS superblock mount flags (s_flags) to consider when @@ -672,12 +679,14 @@ struct TCP_Server_Info { unsigned int credits; /* send no more requests at once */ unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ + unsigned int max_in_flight; /* max number of requests that were on wire */ spinlock_t req_lock; /* protect the two values above */ struct mutex srv_mutex; struct task_struct *tsk; char server_GUID[16]; __u16 sec_mode; bool sign; /* is signing enabled on this connection? */ + bool ignore_signature:1; /* skip validation of signatures in SMB2/3 rsp */ bool session_estab; /* mark when very first sess is established */ int echo_credits; /* echo reserved slots */ int oplock_credits; /* oplock break reserved slots */ @@ -740,6 +749,7 @@ struct TCP_Server_Info { #endif /* STATS2 */ unsigned int max_read; unsigned int max_write; + unsigned int min_offload; __le16 compress_algorithm; __le16 cipher_type; /* save initital negprot hash */ @@ -755,6 +765,7 @@ struct TCP_Server_Info { * reconnect. */ int nr_targets; + bool noblockcnt; /* use non-blocking connect() */ }; struct cifs_credits { @@ -1082,6 +1093,7 @@ struct cifs_tcon { bool need_reopen_files:1; /* need to reopen tcon file handles */ bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ + bool no_lease:1; /* Do not request leases on files or directories */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1366,9 +1378,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) #define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG) +#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) #define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG) +#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) /* * One of these for each file inode @@ -1887,6 +1899,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 592a6cea2b79..99b1b1ef558c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -137,7 +137,11 @@ extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, struct cifsFileInfo **ret_file); +extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, + struct cifsFileInfo **ret_file); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); +extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, + struct cifsFileInfo **ret_file); extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); @@ -197,6 +201,7 @@ extern int cifs_rename_pending_delete(const char *full_path, const unsigned int xid); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, + bool get_mode_from_special_sid, const char *path, const struct cifs_fid *pfid); extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, kuid_t, kgid_t); diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c new file mode 100644 index 000000000000..37edbfb8e096 --- /dev/null +++ b/fs/cifs/cifsroot.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SMB root file system support + * + * Copyright (c) 2019 Paulo Alcantara <palcantara@suse.de> + */ +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/root_dev.h> +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <net/ipconfig.h> + +#define DEFAULT_MNT_OPTS \ + "vers=1.0,cifsacl,mfsymlinks,rsize=1048576,wsize=65536,uid=0,gid=0," \ + "hard,rootfs" + +static char root_dev[2048] __initdata = ""; +static char root_opts[1024] __initdata = DEFAULT_MNT_OPTS; + +static __be32 __init parse_srvaddr(char *start, char *end) +{ + /* TODO: ipv6 support */ + char addr[sizeof("aaa.bbb.ccc.ddd")]; + int i = 0; + + while (start < end && i < sizeof(addr) - 1) { + if (isdigit(*start) || *start == '.') + addr[i++] = *start; + start++; + } + addr[i] = '\0'; + return in_aton(addr); +} + +/* cifsroot=//<server-ip>/<share>[,options] */ +static int __init cifs_root_setup(char *line) +{ + char *s; + int len; + __be32 srvaddr = htonl(INADDR_NONE); + + ROOT_DEV = Root_CIFS; + + if (strlen(line) > 3 && line[0] == '/' && line[1] == '/') { + s = strchr(&line[2], '/'); + if (!s || s[1] == '\0') + return 1; + + /* make s point to ',' or '\0' at end of line */ + s = strchrnul(s, ','); + /* len is strlen(unc) + '\0' */ + len = s - line + 1; + if (len > sizeof(root_dev)) { + printk(KERN_ERR "Root-CIFS: UNC path too long\n"); + return 1; + } + strlcpy(root_dev, line, len); + srvaddr = parse_srvaddr(&line[2], s); + if (*s) { + int n = snprintf(root_opts, + sizeof(root_opts), "%s,%s", + DEFAULT_MNT_OPTS, s + 1); + if (n >= sizeof(root_opts)) { + printk(KERN_ERR "Root-CIFS: mount options string too long\n"); + root_opts[sizeof(root_opts)-1] = '\0'; + return 1; + } + } + } + + root_server_addr = srvaddr; + + return 1; +} + +__setup("cifsroot=", cifs_root_setup); + +int __init cifs_root_data(char **dev, char **opts) +{ + if (!root_dev[0] || root_server_addr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-CIFS: no SMB server address\n"); + return -1; + } + + *dev = root_dev; + *opts = root_opts; + + return 0; +} diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3907653e63c7..dbee2132e419 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1393,7 +1393,7 @@ int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock, FILE_ALL_INFO *buf) { - int rc = -EACCES; + int rc; OPEN_REQ *req = NULL; OPEN_RSP *rsp = NULL; int bytes_returned; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5299effa6f7d..2850c3ce4391 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -74,7 +74,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_forceuid, Opt_noforceuid, Opt_forcegid, Opt_noforcegid, - Opt_noblocksend, Opt_noautotune, + Opt_noblocksend, Opt_noautotune, Opt_nolease, Opt_hard, Opt_soft, Opt_perm, Opt_noperm, Opt_mapposix, Opt_nomapposix, Opt_mapchars, Opt_nomapchars, Opt_sfu, @@ -91,18 +91,19 @@ enum { Opt_serverino, Opt_noserverino, Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl, Opt_acl, Opt_noacl, Opt_locallease, - Opt_sign, Opt_seal, Opt_noac, + Opt_sign, Opt_ignore_signature, Opt_seal, Opt_noac, Opt_fsc, Opt_mfsymlinks, Opt_multiuser, Opt_sloppy, Opt_nosharesock, Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, - Opt_domainauto, Opt_rdma, Opt_modesid, + Opt_domainauto, Opt_rdma, Opt_modesid, Opt_rootfs, Opt_compress, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, + Opt_min_enc_offload, Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo, Opt_echo_interval, Opt_max_credits, Opt_handletimeout, Opt_snapshot, @@ -134,6 +135,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_noforcegid, "noforcegid" }, { Opt_noblocksend, "noblocksend" }, { Opt_noautotune, "noautotune" }, + { Opt_nolease, "nolease" }, { Opt_hard, "hard" }, { Opt_soft, "soft" }, { Opt_perm, "perm" }, @@ -183,6 +185,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_noacl, "noacl" }, { Opt_locallease, "locallease" }, { Opt_sign, "sign" }, + { Opt_ignore_signature, "signloosely" }, { Opt_seal, "seal" }, { Opt_noac, "noac" }, { Opt_fsc, "fsc" }, @@ -206,6 +209,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_dirmode, "dirmode=%s" }, { Opt_dirmode, "dir_mode=%s" }, { Opt_port, "port=%s" }, + { Opt_min_enc_offload, "esize=%s" }, { Opt_blocksize, "bsize=%s" }, { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, @@ -262,6 +266,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_ignore, "nomand" }, { Opt_ignore, "relatime" }, { Opt_ignore, "_netdev" }, + { Opt_rootfs, "rootfs" }, { Opt_err, NULL } }; @@ -298,6 +303,8 @@ enum { Opt_cache_loose, Opt_cache_strict, Opt_cache_none, + Opt_cache_ro, + Opt_cache_rw, Opt_cache_err }; @@ -305,6 +312,8 @@ static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_loose, "loose" }, { Opt_cache_strict, "strict" }, { Opt_cache_none, "none" }, + { Opt_cache_ro, "ro" }, + { Opt_cache_rw, "singleclient" }, { Opt_cache_err, NULL } }; @@ -489,7 +498,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } else { rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it); if (rc && (rc != -EOPNOTSUPP)) { - cifs_dbg(VFS, "%s: no target servers for DFS failover\n", + cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n", __func__); } else { server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list); @@ -617,12 +626,12 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = dfs_cache_noreq_update_tgthint(cifs_sb->origin_fullpath + 1, tgt_it); if (rc) { - cifs_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n", + cifs_server_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n", __func__, rc); } rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server); if (rc) { - cifs_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n", + cifs_server_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n", __func__, rc); } dfs_cache_free_tgts(&tgt_list); @@ -678,7 +687,7 @@ allocate_buffers(struct TCP_Server_Info *server) if (!server->bigbuf) { server->bigbuf = (char *)cifs_buf_get(); if (!server->bigbuf) { - cifs_dbg(VFS, "No memory for large SMB response\n"); + cifs_server_dbg(VFS, "No memory for large SMB response\n"); msleep(3000); /* retry will check if exiting */ return false; @@ -691,7 +700,7 @@ allocate_buffers(struct TCP_Server_Info *server) if (!server->smallbuf) { server->smallbuf = (char *)cifs_small_buf_get(); if (!server->smallbuf) { - cifs_dbg(VFS, "No memory for SMB response\n"); + cifs_server_dbg(VFS, "No memory for SMB response\n"); msleep(1000); /* retry will check if exiting */ return false; @@ -712,7 +721,7 @@ server_unresponsive(struct TCP_Server_Info *server) * We need to wait 3 echo intervals to make sure we handle such * situations right: * 1s client sends a normal SMB request - * 3s client gets a response + * 2s client gets a response * 30s echo workqueue job pops, and decides we got a response recently * and don't need to send another * ... @@ -722,8 +731,8 @@ server_unresponsive(struct TCP_Server_Info *server) if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { - cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", - server->hostname, (3 * server->echo_interval) / HZ); + cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", + (3 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -861,7 +870,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) wake_up(&server->response_q); break; default: - cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); + cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); cifs_reconnect(server); } @@ -1008,7 +1017,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* make sure this will fit in a large buffer */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - server->vals->header_preamble_size) { - cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); + cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); return -ECONNABORTED; @@ -1149,7 +1158,7 @@ next_pdu: /* make sure we have enough to get to the MID */ if (server->pdu_size < HEADER_SIZE(server) - 1 - server->vals->header_preamble_size) { - cifs_dbg(VFS, "SMB response too short (%u bytes)\n", + cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n", server->pdu_size); cifs_reconnect(server); wake_up(&server->response_q); @@ -1222,7 +1231,7 @@ next_pdu: smb2_add_credits_from_hdr(bufs[i], server); cifs_dbg(FYI, "Received oplock break\n"); } else { - cifs_dbg(VFS, "No task to wake, unknown frame " + cifs_server_dbg(VFS, "No task to wake, unknown frame " "received! NumMids %d\n", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", bufs[i], @@ -1418,14 +1427,32 @@ cifs_parse_cache_flavor(char *value, struct smb_vol *vol) case Opt_cache_loose: vol->direct_io = false; vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; break; case Opt_cache_strict: vol->direct_io = false; vol->strict_io = true; + vol->cache_ro = false; + vol->cache_rw = false; break; case Opt_cache_none: vol->direct_io = true; vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_ro: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = true; + vol->cache_rw = false; + break; + case Opt_cache_rw: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = true; break; default: cifs_dbg(VFS, "bad cache= option: %s\n", value); @@ -1713,6 +1740,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_noautotune: vol->noautotune = 1; break; + case Opt_nolease: + vol->no_lease = 1; + break; case Opt_hard: vol->retry = 1; break; @@ -1748,6 +1778,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_nodfs: vol->nodfs = 1; break; + case Opt_rootfs: +#ifdef CONFIG_CIFS_ROOT + vol->rootfs = true; +#endif + break; case Opt_posixpaths: vol->posix_paths = 1; break; @@ -1855,6 +1890,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_sign: vol->sign = true; break; + case Opt_ignore_signature: + vol->sign = true; + vol->ignore_signature = true; + break; case Opt_seal: /* we do not do the following in secFlags because seal * is a per tree connection (mount) not a per socket @@ -1989,6 +2028,13 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } port = (unsigned short)option; break; + case Opt_min_enc_offload: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "Invalid minimum encrypted read offload size (esize)\n"); + goto cifs_parse_mount_err; + } + vol->min_offload = option; + break; case Opt_blocksize: if (get_option_ul(args, &option)) { cifs_dbg(VFS, "%s: Invalid blocksize value\n", @@ -2586,6 +2632,12 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (server->rdma != vol->rdma) return 0; + if (server->ignore_signature != vol->ignore_signature) + return 0; + + if (server->min_offload != vol->min_offload) + return 0; + return 1; } @@ -2681,11 +2733,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info) goto out_err_crypto_release; } - tcp_ses->noblocksnd = volume_info->noblocksnd; + tcp_ses->noblockcnt = volume_info->rootfs; + tcp_ses->noblocksnd = volume_info->noblocksnd || volume_info->rootfs; tcp_ses->noautotune = volume_info->noautotune; tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; tcp_ses->rdma = volume_info->rdma; tcp_ses->in_flight = 0; + tcp_ses->max_in_flight = 0; tcp_ses->credits = 1; init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); @@ -2760,10 +2814,11 @@ smbd_connected: module_put(THIS_MODULE); goto out_err_crypto_release; } + tcp_ses->min_offload = volume_info->min_offload; tcp_ses->tcpStatus = CifsNeedNegotiate; tcp_ses->nr_targets = 1; - + tcp_ses->ignore_signature = volume_info->ignore_signature; /* thread spawned, put it on the list */ spin_lock(&cifs_tcp_ses_lock); list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); @@ -2840,16 +2895,17 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) struct nls_table *nls_codepage; char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; bool seal = false; + struct TCP_Server_Info *server = ses->server; /* * If the mount request that resulted in the creation of the * session requires encryption, force IPC to be encrypted too. */ if (volume_info->seal) { - if (ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) + if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) seal = true; else { - cifs_dbg(VFS, + cifs_server_dbg(VFS, "IPC: server doesn't support encryption\n"); return -EOPNOTSUPP; } @@ -2859,7 +2915,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) if (tcon == NULL) return -ENOMEM; - scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname); + scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname); /* cannot fail */ nls_codepage = load_nls_default(); @@ -2868,11 +2924,11 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->ses = ses; tcon->ipc = true; tcon->seal = seal; - rc = ses->server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage); + rc = server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage); free_xid(xid); if (rc) { - cifs_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); + cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); tconInfoFree(tcon); goto out; } @@ -2958,7 +3014,7 @@ void cifs_put_smb_ses(struct cifs_ses *ses) xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) - cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n", + cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n", __func__, rc); _free_xid(xid); } @@ -3212,7 +3268,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->sectype = volume_info->sectype; ses->sign = volume_info->sign; - mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); if (!rc) @@ -3250,6 +3305,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info) return 0; if (tcon->handle_timeout != volume_info->handle_timeout) return 0; + if (tcon->no_lease != volume_info->no_lease) + return 0; return 1; } @@ -3455,6 +3512,14 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->use_resilient = true; } + /* If the user really knows what they are doing they can override */ + if (tcon->share_flags & SMB2_SHAREFLAG_NO_CACHING) { + if (volume_info->cache_ro) + cifs_dbg(VFS, "cache=ro requested on mount but NO_CACHING flag set on share\n"); + else if (volume_info->cache_rw) + cifs_dbg(VFS, "cache=singleclient requested on mount but NO_CACHING flag set on share\n"); + } + /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last @@ -3464,6 +3529,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->nocase = volume_info->nocase; tcon->nohandlecache = volume_info->nohandlecache; tcon->local_lease = volume_info->local_lease; + tcon->no_lease = volume_info->no_lease; INIT_LIST_HEAD(&tcon->pending_opens); spin_lock(&cifs_tcp_ses_lock); @@ -3659,10 +3725,10 @@ bind_socket(struct TCP_Server_Info *server) saddr4 = (struct sockaddr_in *)&server->srcaddr; saddr6 = (struct sockaddr_in6 *)&server->srcaddr; if (saddr6->sin6_family == AF_INET6) - cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n", + cifs_server_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n", &saddr6->sin6_addr, rc); else - cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n", + cifs_server_dbg(VFS, "Failed to bind to: %pI4, error: %d\n", &saddr4->sin_addr.s_addr, rc); } } @@ -3766,7 +3832,7 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, IPPROTO_TCP, &socket, 1); if (rc < 0) { - cifs_dbg(VFS, "Error %d creating socket\n", rc); + cifs_server_dbg(VFS, "Error %d creating socket\n", rc); server->ssocket = NULL; return rc; } @@ -3814,7 +3880,11 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = socket->ops->connect(socket, saddr, slen, 0); + rc = socket->ops->connect(socket, saddr, slen, + server->noblockcnt ? O_NONBLOCK : 0); + + if (rc == -EINPROGRESS) + rc = 0; if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); sock_release(socket); @@ -4040,6 +4110,14 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_dbg(FYI, "mounting share using direct i/o\n"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; } + if (pvolume_info->cache_ro) { + cifs_dbg(VFS, "mounting share with read only caching. Ensure that the share will not be modified while in use.\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RO_CACHE; + } else if (pvolume_info->cache_rw) { + cifs_dbg(VFS, "mounting share in single client RW caching mode. Ensure that no other systems will be accessing the share.\n"); + cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_RO_CACHE | + CIFS_MOUNT_RW_CACHE); + } if (pvolume_info->mfsymlinks) { if (pvolume_info->sfu_emul) { /* @@ -4150,7 +4228,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, if ((vol->persistent == true) && (!(ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) { - cifs_dbg(VFS, "persistent handles not supported by server\n"); + cifs_server_dbg(VFS, "persistent handles not supported by server\n"); return -EOPNOTSUPP; } @@ -4182,8 +4260,18 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, tcon->unix_ext = 0; /* server does not support them */ /* do not care if a following call succeed - informational */ - if (!tcon->pipe && server->ops->qfs_tcon) + if (!tcon->pipe && server->ops->qfs_tcon) { server->ops->qfs_tcon(*xid, tcon); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) { + if (tcon->fsDevInfo.DeviceCharacteristics & + FILE_READ_ONLY_DEVICE) + cifs_dbg(VFS, "mounted to read only share\n"); + else if ((cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_RW_CACHE) == 0) + cifs_dbg(VFS, "read only mount of RW share\n"); + /* no need to log a RW mount of a typical RW share */ + } + } cifs_sb->wsize = server->ops->negotiate_wsize(tcon, vol); cifs_sb->rsize = server->ops->negotiate_rsize(tcon, vol); @@ -4588,7 +4676,7 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol, rc = cifs_are_all_path_components_accessible(server, xid, tcon, cifs_sb, full_path, tcon->Flags & SMB_SHARE_IS_IN_DFS); if (rc != 0) { - cifs_dbg(VFS, "cannot query dirs between root and final path, " + cifs_server_dbg(VFS, "cannot query dirs between root and final path, " "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; rc = 0; @@ -5090,7 +5178,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, rc = server->ops->sess_setup(xid, ses, nls_info); if (rc) - cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc); + cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); return rc; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index be424e81e3ad..dd5ac841aefa 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -125,7 +125,7 @@ cifs_bp_rename_retry: } rcu_read_unlock(); - full_path = kmalloc(namelen+1, GFP_KERNEL); + full_path = kmalloc(namelen+1, GFP_ATOMIC); if (full_path == NULL) return full_path; full_path[namelen] = 0; /* trailing null */ diff --git a/fs/cifs/export.c b/fs/cifs/export.c index ce8b7f677c58..eb0bb8ca8e63 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -24,7 +24,7 @@ */ /* - * See Documentation/filesystems/nfs/Exporting + * See Documentation/filesystems/nfs/exporting.rst * and examples in fs/exportfs * * Since cifs is a network file system, an "fsid" must be included for diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 97090693d182..4b95700c507c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1693,9 +1693,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) bool posix_lck = false; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - struct cifsInodeInfo *cinode; struct cifsFileInfo *cfile; - __u16 netfid; __u32 type; rc = -EACCES; @@ -1711,8 +1709,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, tcon->ses->server); cifs_sb = CIFS_FILE_SB(file); - netfid = cfile->fid.netfid; - cinode = CIFS_I(file_inode(file)); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && @@ -1764,7 +1760,6 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; - struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; struct TCP_Server_Info *server; unsigned int xid; @@ -1772,8 +1767,6 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry)); struct cifs_io_parms io_parms; - cifs_sb = CIFS_SB(dentry->d_sb); - cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n", write_size, *offset, dentry); @@ -1980,6 +1973,77 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) return cfile; } +int +cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, + struct cifsFileInfo **ret_file) +{ + struct list_head *tmp; + struct cifsFileInfo *cfile; + struct cifsInodeInfo *cinode; + char *full_path; + + *ret_file = NULL; + + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, + tlist); + full_path = build_path_from_dentry(cfile->dentry); + if (full_path == NULL) { + spin_unlock(&tcon->open_file_lock); + return -ENOMEM; + } + if (strcmp(full_path, name)) { + kfree(full_path); + continue; + } + + kfree(full_path); + cinode = CIFS_I(d_inode(cfile->dentry)); + spin_unlock(&tcon->open_file_lock); + return cifs_get_writable_file(cinode, 0, ret_file); + } + + spin_unlock(&tcon->open_file_lock); + return -ENOENT; +} + +int +cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, + struct cifsFileInfo **ret_file) +{ + struct list_head *tmp; + struct cifsFileInfo *cfile; + struct cifsInodeInfo *cinode; + char *full_path; + + *ret_file = NULL; + + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, + tlist); + full_path = build_path_from_dentry(cfile->dentry); + if (full_path == NULL) { + spin_unlock(&tcon->open_file_lock); + return -ENOMEM; + } + if (strcmp(full_path, name)) { + kfree(full_path); + continue; + } + + kfree(full_path); + cinode = CIFS_I(d_inode(cfile->dentry)); + spin_unlock(&tcon->open_file_lock); + *ret_file = find_readable_file(cinode, 0); + return *ret_file ? 0 : -ENOENT; + } + + spin_unlock(&tcon->open_file_lock); + return -ENOENT; +} + static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) { struct address_space *mapping = page->mapping; @@ -3577,10 +3641,8 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) struct cifs_readdata *rdata, *tmp; struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; int rc; - tcon = tlink_tcon(ctx->cfile->tlink); cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb); mutex_lock(&ctx->aio_mutex); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 56ca4b8ccaba..26cdfbf1e164 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -893,8 +893,17 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } /* fill in 0777 bits from ACL */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) { + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true, + full_path, fid); + if (rc) { + cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n", + __func__, rc); + goto cgii_exit; + } + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, + full_path, fid); if (rc) { cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", __func__, rc); @@ -2480,7 +2489,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_GID) gid = attrs->ia_gid; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) { if (uid_valid(uid) || gid_valid(gid)) { rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, uid, gid); @@ -2501,7 +2511,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { mode = attrs->ia_mode; rc = 0; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) { rc = id_mode_to_cifs_acl(inode, full_path, mode, INVALID_UID, INVALID_GID); if (rc) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index ed92958e842d..49c17ee18254 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -949,8 +949,8 @@ static const int total_days_of_prev_months[] = { struct timespec64 cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { struct timespec64 ts; - time64_t sec; - int min, days, month, year; + time64_t sec, days; + int min, day, month, year; u16 date = le16_to_cpu(le_date); u16 time = le16_to_cpu(le_time); SMB_TIME *st = (SMB_TIME *)&time; @@ -966,15 +966,15 @@ struct timespec64 cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) sec += 60 * 60 * st->Hours; if (st->Hours > 24) cifs_dbg(VFS, "illegal hours %d\n", st->Hours); - days = sd->Day; + day = sd->Day; month = sd->Month; - if (days < 1 || days > 31 || month < 1 || month > 12) { - cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days); - days = clamp(days, 1, 31); + if (day < 1 || day > 31 || month < 1 || month > 12) { + cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, day); + day = clamp(day, 1, 31); month = clamp(month, 1, 12); } month -= 1; - days += total_days_of_prev_months[month]; + days = day + total_days_of_prev_months[month]; days += 3652; /* account for difference in days between 1980 and 1970 */ year = sd->Year; days += year * 365; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index d8d9cdfa30b6..d2a3fb7e5c8d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -51,7 +51,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *ptr, int command) + __u32 create_options, void *ptr, int command, + struct cifsFileInfo *cfile) { int rc; __le16 *utf16_path = NULL; @@ -83,10 +84,16 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); + /* We already have a handle so we can skip the open */ + if (cfile) + goto after_open; + /* Open */ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; + if (!utf16_path) { + rc = -ENOMEM; + goto finished; + } oparms.tcon = tcon; oparms.desired_access = desired_access; @@ -106,7 +113,10 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst++]); + smb2_set_next_command(tcon, &rqst[num_rqst]); + after_open: + num_rqst++; + rc = 0; /* Operation */ switch (command) { @@ -115,15 +125,31 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = qi_iov; rqst[num_rqst].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, FILE_ALL_INFORMATION, + if (cfile) + rc = SMB2_query_info_init(tcon, &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + else { + rc = SMB2_query_info_init(tcon, &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } + } + if (rc) goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); + num_rqst++; trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); break; @@ -182,14 +208,27 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, size[0] = sizeof(FILE_BASIC_INFO); data[0] = ptr; - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_BASIC_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); + if (cfile) + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + else { + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } + } + if (rc) goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); + num_rqst++; trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); break; @@ -210,14 +249,25 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, size[1] = len + 2 /* null */; data[1] = (__le16 *)ptr; - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_RENAME_INFORMATION, + if (cfile) + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + else { + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } + } if (rc) goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); + num_rqst++; trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_HARDLINK: @@ -254,21 +304,43 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto finished; + /* We already have a handle so we can skip the close */ + if (cfile) + goto after_close; /* Close */ memset(&close_iov, 0, sizeof(close_iov)); rqst[num_rqst].rq_iov = close_iov; rqst[num_rqst].rq_nvec = 1; rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID); - smb2_set_related(&rqst[num_rqst++]); + smb2_set_related(&rqst[num_rqst]); if (rc) goto finished; - - rc = compound_send_recv(xid, ses, flags, num_rqst, rqst, - resp_buftype, rsp_iov); + after_close: + num_rqst++; + + if (cfile) { + cifsFileInfo_put(cfile); + cfile = NULL; + rc = compound_send_recv(xid, ses, flags, num_rqst - 2, + &rqst[1], &resp_buftype[1], + &rsp_iov[1]); + } else + rc = compound_send_recv(xid, ses, flags, num_rqst, + rqst, resp_buftype, + rsp_iov); finished: + if (cfile) + cifsFileInfo_put(cfile); + SMB2_open_free(&rqst[0]); + if (rc == -EREMCHG) { + printk_once(KERN_WARNING "server share %s deleted\n", + tcon->treeName); + tcon->need_reconnect = true; + } + switch (command) { case SMB2_OP_QUERY_INFO: if (rc == 0) { @@ -371,6 +443,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, __u32 create_options = 0; struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; + struct cifsFileInfo *cfile; *adjust_tz = false; *symlink = false; @@ -402,9 +475,10 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - smb2_data, SMB2_OP_QUERY_INFO); + smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { *symlink = true; create_options |= OPEN_REPARSE_POINT; @@ -413,7 +487,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, smb2_data, - SMB2_OP_QUERY_INFO); + SMB2_OP_QUERY_INFO, NULL); } if (rc) goto out; @@ -430,7 +504,7 @@ smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); + CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL); } void @@ -440,6 +514,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, { FILE_BASIC_INFO data; struct cifsInodeInfo *cifs_i; + struct cifsFileInfo *cfile; u32 dosattrs; int tmprc; @@ -447,9 +522,11 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); + cifs_get_writable_path(tcon, name, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); + CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO, + cfile); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -460,7 +537,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, - NULL, SMB2_OP_RMDIR); + NULL, SMB2_OP_RMDIR, NULL); } int @@ -469,13 +546,14 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE); + NULL, SMB2_OP_DELETE, NULL); } static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb, __u32 access, int command) + struct cifs_sb_info *cifs_sb, __u32 access, int command, + struct cifsFileInfo *cfile) { __le16 *smb2_to_name = NULL; int rc; @@ -486,7 +564,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command); + FILE_OPEN, 0, smb2_to_name, command, cfile); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -497,8 +575,12 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, const char *from_name, const char *to_name, struct cifs_sb_info *cifs_sb) { - return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, - DELETE, SMB2_OP_RENAME); + struct cifsFileInfo *cfile; + + cifs_get_writable_path(tcon, from_name, &cfile); + + return smb2_set_path_attr(xid, tcon, from_name, to_name, + cifs_sb, DELETE, SMB2_OP_RENAME, cfile); } int @@ -507,7 +589,8 @@ smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) { return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, - FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK); + FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK, + NULL); } int @@ -519,7 +602,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF); + SMB2_OP_SET_EOF, NULL); } int @@ -541,7 +624,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO); + SMB2_OP_SET_INFO, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 82ade16c9501..7fde3775cb57 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -511,7 +511,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_PRINT_QUEUE_FULL, -EIO, "STATUS_PRINT_QUEUE_FULL"}, {STATUS_NO_SPOOL_SPACE, -EIO, "STATUS_NO_SPOOL_SPACE"}, {STATUS_PRINT_CANCELLED, -EIO, "STATUS_PRINT_CANCELLED"}, - {STATUS_NETWORK_NAME_DELETED, -EIO, "STATUS_NETWORK_NAME_DELETED"}, + {STATUS_NETWORK_NAME_DELETED, -EREMCHG, "STATUS_NETWORK_NAME_DELETED"}, {STATUS_NETWORK_ACCESS_DENIED, -EACCES, "STATUS_NETWORK_ACCESS_DENIED"}, {STATUS_BAD_DEVICE_TYPE, -EIO, "STATUS_BAD_DEVICE_TYPE"}, {STATUS_BAD_NETWORK_NAME, -ENOENT, "STATUS_BAD_NETWORK_NAME"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 64a5864127be..eaed18061314 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -109,10 +109,10 @@ smb2_add_credits(struct TCP_Server_Info *server, /* change_conf hasn't been executed */ break; case 0: - cifs_dbg(VFS, "Possible client or server bug - zero credits\n"); + cifs_server_dbg(VFS, "Possible client or server bug - zero credits\n"); break; case 1: - cifs_dbg(VFS, "disabling echoes and oplocks\n"); + cifs_server_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: cifs_dbg(FYI, "disabling oplocks\n"); @@ -203,6 +203,8 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, credits->instance = server->reconnect_instance; server->credits -= credits->value; server->in_flight++; + if (server->in_flight > server->max_in_flight) + server->max_in_flight = server->in_flight; break; } } @@ -230,7 +232,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, if (server->reconnect_instance != credits->instance) { spin_unlock(&server->req_lock); - cifs_dbg(VFS, "trying to return %d credits to old session\n", + cifs_server_dbg(VFS, "trying to return %d credits to old session\n", credits->value - new_val); return -EAGAIN; } @@ -270,7 +272,7 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) __u64 wire_mid = le64_to_cpu(shdr->MessageId); if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { - cifs_dbg(VFS, "Encrypted frame parsing not supported yet\n"); + cifs_server_dbg(VFS, "Encrypted frame parsing not supported yet\n"); return NULL; } @@ -294,10 +296,10 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server) #ifdef CONFIG_CIFS_DEBUG2 struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", + cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, shdr->ProcessId); - cifs_dbg(VFS, "smb buf %p len %u\n", buf, + cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, server->ops->calc_smb_size(buf, server)); #endif } @@ -576,7 +578,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) "server does not support query network interfaces\n"); goto out; } else if (rc != 0) { - cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); + cifs_tcon_dbg(VFS, "error %d on ioctl to get interface list\n", rc); goto out; } @@ -656,6 +658,15 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) return 0; } + /* + * We do not hold the lock for the open because in case + * SMB2_open needs to reconnect, it will end up calling + * cifs_mark_open_files_invalid() which takes the lock again + * thus causing a deadlock + */ + + mutex_unlock(&tcon->crfid.fid_mutex); + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -677,7 +688,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path); if (rc) - goto oshr_exit; + goto oshr_free; smb2_set_next_command(tcon, &rqst[0]); memset(&qi_iov, 0, sizeof(qi_iov)); @@ -690,18 +701,10 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); if (rc) - goto oshr_exit; + goto oshr_free; smb2_set_related(&rqst[1]); - /* - * We do not hold the lock for the open because in case - * SMB2_open needs to reconnect, it will end up calling - * cifs_mark_open_files_invalid() which takes the lock again - * thus causing a deadlock - */ - - mutex_unlock(&tcon->crfid.fid_mutex); rc = compound_send_recv(xid, ses, flags, 2, rqst, resp_buftype, rsp_iov); mutex_lock(&tcon->crfid.fid_mutex); @@ -739,8 +742,14 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) /* Cached root is still invalid, continue normaly */ - if (rc) + if (rc) { + if (rc == -EREMCHG) { + tcon->need_reconnect = true; + printk_once(KERN_WARNING "server share %s deleted\n", + tcon->treeName); + } goto oshr_exit; + } o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; @@ -1330,11 +1339,11 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, (char **)&res_key, &ret_data_len); if (rc) { - cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); + cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); goto req_res_key_exit; } if (ret_data_len < sizeof(struct resume_key_req)) { - cifs_dbg(VFS, "Invalid refcopy resume key length\n"); + cifs_tcon_dbg(VFS, "Invalid refcopy resume key length\n"); rc = -EINVAL; goto req_res_key_exit; } @@ -1369,7 +1378,10 @@ smb2_ioctl_query_info(const unsigned int xid, struct cifs_fid fid; struct kvec qi_iov[1]; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov[1]; + unsigned int size[2]; + void *data[2]; memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; @@ -1404,7 +1416,6 @@ smb2_ioctl_query_info(const unsigned int xid, memset(&oparms, 0, sizeof(oparms)); oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL; oparms.disposition = FILE_OPEN; if (is_dir) oparms.create_options = CREATE_NOT_FILE; @@ -1413,9 +1424,6 @@ smb2_ioctl_query_info(const unsigned int xid, oparms.fid = &fid; oparms.reconnect = false; - /* - * FSCTL codes encode the special access they need in the fsctl code. - */ if (qi.flags & PASSTHRU_FSCTL) { switch (qi.info_type & FSCTL_DEVICE_ACCESS_MASK) { case FSCTL_DEVICE_ACCESS_FILE_READ_WRITE_ACCESS: @@ -1431,6 +1439,10 @@ smb2_ioctl_query_info(const unsigned int xid, oparms.desired_access = GENERIC_WRITE; break; } + } else if (qi.flags & PASSTHRU_SET_INFO) { + oparms.desired_access = GENERIC_WRITE; + } else { + oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL; } rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path); @@ -1454,6 +1466,24 @@ smb2_ioctl_query_info(const unsigned int xid, qi.output_buffer_length, CIFSMaxBufSize); } + } else if (qi.flags == PASSTHRU_SET_INFO) { + /* Can eventually relax perm check since server enforces too */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EPERM; + else { + memset(&si_iov, 0, sizeof(si_iov)); + rqst[1].rq_iov = si_iov; + rqst[1].rq_nvec = 1; + + size[0] = 8; + data[0] = buffer; + + rc = SMB2_set_info_init(tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + } } else if (qi.flags == PASSTHRU_QUERY_INFO) { memset(&qi_iov, 0, sizeof(qi_iov)); rqst[1].rq_iov = qi_iov; @@ -1465,7 +1495,7 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length, qi.output_buffer_length, buffer); } else { /* unknown flags */ - cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); + cifs_tcon_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); rc = -EINVAL; } @@ -1592,7 +1622,7 @@ smb2_copychunk_range(const unsigned int xid, if (rc == 0) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) { - cifs_dbg(VFS, "invalid cchunk response size\n"); + cifs_tcon_dbg(VFS, "invalid cchunk response size\n"); rc = -EIO; goto cchunk_out; } @@ -1606,12 +1636,12 @@ smb2_copychunk_range(const unsigned int xid, */ if (le32_to_cpu(retbuf->TotalBytesWritten) > le32_to_cpu(pcchunk->Length)) { - cifs_dbg(VFS, "invalid copy chunk response\n"); + cifs_tcon_dbg(VFS, "invalid copy chunk response\n"); rc = -EIO; goto cchunk_out; } if (le32_to_cpu(retbuf->ChunksWritten) != 1) { - cifs_dbg(VFS, "invalid num chunks written\n"); + cifs_tcon_dbg(VFS, "invalid num chunks written\n"); rc = -EIO; goto cchunk_out; } @@ -2214,6 +2244,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype, rsp_iov); if (rc) { free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + if (rc == -EREMCHG) { + tcon->need_reconnect = true; + printk_once(KERN_WARNING "server share %s deleted\n", + tcon->treeName); + } goto qic_exit; } *rsp = rsp_iov[1]; @@ -2401,7 +2436,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, if (rc) { if ((rc != -ENOENT) && (rc != -EOPNOTSUPP)) - cifs_dbg(VFS, "ioctl error in %s rc=%d\n", __func__, rc); + cifs_tcon_dbg(VFS, "ioctl error in %s rc=%d\n", __func__, rc); goto out; } @@ -2410,7 +2445,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, nls_codepage, remap, search_name, true /* is_unicode */); if (rc) { - cifs_dbg(VFS, "parse error in %s rc=%d\n", __func__, rc); + cifs_tcon_dbg(VFS, "parse error in %s rc=%d\n", __func__, rc); goto out; } @@ -2640,7 +2675,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) > rsp_iov[1].iov_len) { - cifs_dbg(VFS, "srv returned invalid ioctl len: %d\n", + cifs_tcon_dbg(VFS, "srv returned invalid ioctl len: %d\n", plen); rc = -EIO; goto querty_exit; @@ -2939,7 +2974,6 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len) { struct inode *inode; - struct cifsInodeInfo *cifsi; struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; long rc; @@ -2949,7 +2983,6 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); inode = d_inode(cfile->dentry); - cifsi = CIFS_I(inode); /* Need to make file sparse, if not already, before freeing range. */ /* Consider adding equivalent for compressed since it could also work */ @@ -3595,14 +3628,14 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); if (rc) { - cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__, + cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); return 0; } rc = smb3_crypto_aead_allocate(server); if (rc) { - cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); return rc; } @@ -3610,19 +3643,19 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, server->secmech.ccmaesdecrypt; rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); + cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); return rc; } rc = crypto_aead_setauthsize(tfm, SMB2_SIGNATURE_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Failed to set authsize %d\n", __func__, rc); + cifs_server_dbg(VFS, "%s: Failed to set authsize %d\n", __func__, rc); return rc; } req = aead_request_alloc(tfm, GFP_KERNEL); if (!req) { - cifs_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); + cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); return -ENOMEM; } @@ -3633,7 +3666,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, sg = init_sg(num_rqst, rqst, sign); if (!sg) { - cifs_dbg(VFS, "%s: Failed to init sg\n", __func__); + cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__); rc = -ENOMEM; goto free_req; } @@ -3641,7 +3674,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, iv_len = crypto_aead_ivsize(tfm); iv = kzalloc(iv_len, GFP_KERNEL); if (!iv) { - cifs_dbg(VFS, "%s: Failed to alloc iv\n", __func__); + cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__); rc = -ENOMEM; goto free_sg; } @@ -3883,7 +3916,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, bool use_rdma_mr = false; if (shdr->Command != SMB2_READ) { - cifs_dbg(VFS, "only big read responses are supported\n"); + cifs_server_dbg(VFS, "only big read responses are supported\n"); return -ENOTSUPP; } @@ -3998,8 +4031,55 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return length; } +struct smb2_decrypt_work { + struct work_struct decrypt; + struct TCP_Server_Info *server; + struct page **ppages; + char *buf; + unsigned int npages; + unsigned int len; +}; + + +static void smb2_decrypt_offload(struct work_struct *work) +{ + struct smb2_decrypt_work *dw = container_of(work, + struct smb2_decrypt_work, decrypt); + int i, rc; + struct mid_q_entry *mid; + + rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, + dw->ppages, dw->npages, dw->len); + if (rc) { + cifs_dbg(VFS, "error decrypting rc=%d\n", rc); + goto free_pages; + } + + dw->server->lstrp = jiffies; + mid = smb2_find_mid(dw->server, dw->buf); + if (mid == NULL) + cifs_dbg(FYI, "mid not found\n"); + else { + mid->decrypted = true; + rc = handle_read_data(dw->server, mid, dw->buf, + dw->server->vals->read_rsp_size, + dw->ppages, dw->npages, dw->len); + mid->callback(mid); + cifs_mid_q_entry_release(mid); + } + +free_pages: + for (i = dw->npages-1; i >= 0; i--) + put_page(dw->ppages[i]); + + kfree(dw->ppages); + cifs_small_buf_release(dw->buf); +} + + static int -receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) +receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, + int *num_mids) { char *buf = server->smallbuf; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; @@ -4009,7 +4089,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) unsigned int buflen = server->pdu_size; int rc; int i = 0; + struct smb2_decrypt_work *dw; + *num_mids = 1; len = min_t(unsigned int, buflen, server->vals->read_rsp_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; @@ -4045,6 +4127,32 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) if (rc) goto free_pages; + /* + * For large reads, offload to different thread for better performance, + * use more cores decrypting which can be expensive + */ + + if ((server->min_offload) && (server->in_flight > 1) && + (server->pdu_size >= server->min_offload)) { + dw = kmalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); + if (dw == NULL) + goto non_offloaded_decrypt; + + dw->buf = server->smallbuf; + server->smallbuf = (char *)cifs_small_buf_get(); + + INIT_WORK(&dw->decrypt, smb2_decrypt_offload); + + dw->npages = npages; + dw->server = server; + dw->ppages = pages; + dw->len = len; + queue_work(cifsiod_wq, &dw->decrypt); + *num_mids = 0; /* worker thread takes care of finding mid */ + return -1; + } + +non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, pages, npages, len); if (rc) @@ -4129,7 +4237,7 @@ one_more: } if (*num_mids >= MAX_COMPOUND) { - cifs_dbg(VFS, "too many PDUs in compound\n"); + cifs_server_dbg(VFS, "too many PDUs in compound\n"); return -1; } bufs[*num_mids] = buf; @@ -4175,7 +4283,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, if (pdu_length < sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_sync_hdr)) { - cifs_dbg(VFS, "Transform message is too small (%u)\n", + cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); @@ -4183,7 +4291,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, } if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) { - cifs_dbg(VFS, "Transform message is broken\n"); + cifs_server_dbg(VFS, "Transform message is broken\n"); cifs_reconnect(server); wake_up(&server->response_q); return -ECONNABORTED; @@ -4191,8 +4299,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, /* TODO: add support for compounds containing READ. */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) { - *num_mids = 1; - return receive_encrypted_read(server, &mids[0]); + return receive_encrypted_read(server, &mids[0], num_mids); } return receive_encrypted_standard(server, mids, bufs, num_mids); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 31e4a1b0b170..87066f1af12c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -503,8 +503,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) pneg_ctxt->ContextType = SMB2_NETNAME_NEGOTIATE_CONTEXT_ID; /* copy up to max of first 100 bytes of server name to NetName field */ - pneg_ctxt->DataLength = cpu_to_le16(2 + - (2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp))); + pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)); /* context size is DataLength + minimal smb2_neg_context */ return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8) * 8; @@ -543,7 +542,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, if (*total_len > 200) { /* In case length corrupted don't want to overrun smb buffer */ - cifs_dbg(VFS, "Bad frame length assembling neg contexts\n"); + cifs_server_dbg(VFS, "Bad frame length assembling neg contexts\n"); return; } @@ -661,7 +660,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, cifs_dbg(FYI, "decoding %d negotiate contexts\n", ctxt_cnt); if (len_of_smb <= offset) { - cifs_dbg(VFS, "Invalid response: negotiate context offset\n"); + cifs_server_dbg(VFS, "Invalid response: negotiate context offset\n"); return -EINVAL; } @@ -693,7 +692,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE) server->posix_ext_supported = true; else - cifs_dbg(VFS, "unknown negcontext of type %d ignored\n", + cifs_server_dbg(VFS, "unknown negcontext of type %d ignored\n", le16_to_cpu(pctx->ContextType)); if (rc) @@ -818,7 +817,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); req->DialectCount = cpu_to_le16(2); total_len += 4; - } else if (strcmp(ses->server->vals->version_string, + } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); @@ -841,16 +840,16 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) else req->SecurityMode = 0; - req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); + req->Capabilities = cpu_to_le32(server->vals->req_capabilities); /* ClientGUID must be zero for SMB2.02 dialect */ - if (ses->server->vals->protocol_id == SMB20_PROT_ID) + if (server->vals->protocol_id == SMB20_PROT_ID) memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); else { memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); - if ((ses->server->vals->protocol_id == SMB311_PROT_ID) || - (strcmp(ses->server->vals->version_string, + if ((server->vals->protocol_id == SMB311_PROT_ID) || + (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0)) assemble_neg_contexts(req, server, &total_len); } @@ -869,42 +868,42 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ if (rc == -EOPNOTSUPP) { - cifs_dbg(VFS, "Dialect not supported by server. Consider " + cifs_server_dbg(VFS, "Dialect not supported by server. Consider " "specifying vers=1.0 or vers=2.0 on mount for accessing" " older servers\n"); goto neg_exit; } else if (rc != 0) goto neg_exit; - if (strcmp(ses->server->vals->version_string, + if (strcmp(server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { - cifs_dbg(VFS, + cifs_server_dbg(VFS, "SMB2 dialect returned but not requested\n"); return -EIO; } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { - cifs_dbg(VFS, + cifs_server_dbg(VFS, "SMB2.1 dialect returned but not requested\n"); return -EIO; } - } else if (strcmp(ses->server->vals->version_string, + } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { - cifs_dbg(VFS, + cifs_server_dbg(VFS, "SMB2 dialect returned but not requested\n"); return -EIO; } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { /* ops set to 3.0 by default for default so update */ - ses->server->ops = &smb21_operations; - ses->server->vals = &smb21_values; + server->ops = &smb21_operations; + server->vals = &smb21_values; } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { - ses->server->ops = &smb311_operations; - ses->server->vals = &smb311_values; + server->ops = &smb311_operations; + server->vals = &smb311_values; } } else if (le16_to_cpu(rsp->DialectRevision) != - ses->server->vals->protocol_id) { + server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ - cifs_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", + cifs_server_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", le16_to_cpu(rsp->DialectRevision)); return -EIO; } @@ -922,7 +921,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n"); else { - cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n", + cifs_server_dbg(VFS, "Illegal dialect returned by server 0x%x\n", le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; @@ -982,7 +981,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) rc = smb311_decode_neg_context(rsp, server, rsp_iov.iov_len); else - cifs_dbg(VFS, "Missing expected negotiate contexts\n"); + cifs_server_dbg(VFS, "Missing expected negotiate contexts\n"); } neg_exit: free_rsp_buf(resp_buftype, rsp); @@ -996,11 +995,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) struct validate_negotiate_info_rsp *pneg_rsp = NULL; u32 rsplen; u32 inbuflen; /* max of 4 dialects */ + struct TCP_Server_Info *server = tcon->ses->server; cifs_dbg(FYI, "validate negotiate\n"); /* In SMB3.11 preauth integrity supersedes validate negotiate */ - if (tcon->ses->server->dialect == SMB311_PROT_ID) + if (server->dialect == SMB311_PROT_ID) return 0; /* @@ -1019,15 +1019,15 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) - cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); + cifs_tcon_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS); if (!pneg_inbuf) return -ENOMEM; pneg_inbuf->Capabilities = - cpu_to_le32(tcon->ses->server->vals->req_capabilities); - memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid, + cpu_to_le32(server->vals->req_capabilities); + memcpy(pneg_inbuf->Guid, server->client_guid, SMB2_CLIENT_GUID_SIZE); if (tcon->ses->sign) @@ -1040,7 +1040,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->SecurityMode = 0; - if (strcmp(tcon->ses->server->vals->version_string, + if (strcmp(server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); @@ -1048,7 +1048,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* structure is big enough for 3 dialects, sending only 2 */ inbuflen = sizeof(*pneg_inbuf) - (2 * sizeof(pneg_inbuf->Dialects[0])); - } else if (strcmp(tcon->ses->server->vals->version_string, + } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); @@ -1060,7 +1060,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } else { /* otherwise specific dialect was requested */ pneg_inbuf->Dialects[0] = - cpu_to_le16(tcon->ses->server->vals->protocol_id); + cpu_to_le16(server->vals->protocol_id); pneg_inbuf->DialectCount = cpu_to_le16(1); /* structure is big enough for 3 dialects, sending only 1 */ inbuflen = sizeof(*pneg_inbuf) - @@ -1076,18 +1076,18 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) * Old Windows versions or Netapp SMB server can return * not supported error. Client should accept it. */ - cifs_dbg(VFS, "Server does not support validate negotiate\n"); + cifs_tcon_dbg(VFS, "Server does not support validate negotiate\n"); rc = 0; goto out_free_inbuf; } else if (rc != 0) { - cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); + cifs_tcon_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); rc = -EIO; goto out_free_inbuf; } rc = -EIO; if (rsplen != sizeof(*pneg_rsp)) { - cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", + cifs_tcon_dbg(VFS, "invalid protocol negotiate response size: %d\n", rsplen); /* relax check since Mac returns max bufsize allowed on ioctl */ @@ -1096,16 +1096,16 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } /* check validate negotiate info response matches what we got earlier */ - if (pneg_rsp->Dialect != cpu_to_le16(tcon->ses->server->dialect)) + if (pneg_rsp->Dialect != cpu_to_le16(server->dialect)) goto vneg_out; - if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode)) + if (pneg_rsp->SecurityMode != cpu_to_le16(server->sec_mode)) goto vneg_out; /* do not validate server guid because not saved at negprot time yet */ if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND | - SMB2_LARGE_FILES) != tcon->ses->server->capabilities) + SMB2_LARGE_FILES) != server->capabilities) goto vneg_out; /* validate negotiate successful */ @@ -1114,7 +1114,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) goto out_free_rsp; vneg_out: - cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); + cifs_tcon_dbg(VFS, "protocol revalidation - security settings mismatch\n"); out_free_rsp: kfree(pneg_rsp); out_free_inbuf: @@ -1568,7 +1568,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, sess_data->func(sess_data); if ((ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) && (ses->sign)) - cifs_dbg(VFS, "signing requested but authenticated as guest\n"); + cifs_server_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: kfree(sess_data); @@ -1661,10 +1661,11 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, __le16 *unc_path = NULL; int flags = 0; unsigned int total_len; + struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "TCON\n"); - if (!(ses->server) || !tree) + if (!server || !tree) return -EIO; unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL); @@ -1707,7 +1708,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1 * (Samba servers don't always set the flag so also check if null user) */ - if ((ses->server->dialect == SMB311_PROT_ID) && + if ((server->dialect == SMB311_PROT_ID) && !smb3_encryption_required(tcon) && !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && @@ -1746,7 +1747,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, cifs_dbg(FYI, "connection to printer\n"); break; default: - cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); + cifs_server_dbg(VFS, "unknown share type %d\n", rsp->ShareType); rc = -EOPNOTSUPP; goto tcon_error_exit; } @@ -1761,15 +1762,15 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) - cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); + cifs_tcon_dbg(VFS, "DFS capability contradicts DFS flag\n"); if (tcon->seal && - !(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) - cifs_dbg(VFS, "Encryption is requested but not supported\n"); + !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + cifs_tcon_dbg(VFS, "Encryption is requested but not supported\n"); init_copy_chunk_defaults(tcon); - if (tcon->ses->server->ops->validate_negotiate) - rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); + if (server->ops->validate_negotiate) + rc = server->ops->validate_negotiate(xid, tcon); tcon_exit: free_rsp_buf(resp_buftype, rsp); @@ -1778,7 +1779,7 @@ tcon_exit: tcon_error_exit: if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { - cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); + cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); } goto tcon_exit; } @@ -2458,7 +2459,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, iov[1].iov_len = uni_path_len; iov[1].iov_base = path; - if (!server->oplocks) + if ((!server->oplocks) || (tcon->no_lease)) *oplock = SMB2_OPLOCK_LEVEL_NONE; if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) || @@ -2594,6 +2595,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } trace_smb3_open_err(xid, tcon->tid, ses->Suid, oparms->create_options, oparms->desired_access, rc); + if (rc == -EREMCHG) { + printk_once(KERN_WARNING "server share %s deleted\n", + tcon->treeName); + tcon->need_reconnect = true; + } goto creat_exit; } else trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, @@ -2742,6 +2748,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; + struct TCP_Server_Info *server; cifs_dbg(FYI, "SMB2 IOCTL\n"); @@ -2757,7 +2764,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, else return -EIO; - if (!ses || !(ses->server)) + if (!ses) + return -EIO; + server = ses->server; + if (!server) return -EIO; if (smb3_encryption_required(tcon)) @@ -2807,14 +2817,14 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (*plen == 0) goto ioctl_exit; /* server returned no data */ else if (*plen > rsp_iov.iov_len || *plen > 0xFF00) { - cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen); + cifs_tcon_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen); *plen = 0; rc = -EIO; goto ioctl_exit; } if (rsp_iov.iov_len - *plen < le32_to_cpu(rsp->OutputOffset)) { - cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, + cifs_tcon_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, le32_to_cpu(rsp->OutputOffset)); *plen = 0; rc = -EIO; @@ -2913,6 +2923,7 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; + trace_smb3_close_enter(xid, persistent_fid, tcon->tid, ses->Suid); rc = SMB2_close_init(tcon, &rqst, persistent_fid, volatile_fid); if (rc) goto close_exit; @@ -2925,7 +2936,9 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid, rc); goto close_exit; - } + } else + trace_smb3_close_done(xid, persistent_fid, tcon->tid, + ses->Suid); atomic_dec(&tcon->num_remote_opens); @@ -3055,12 +3068,16 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype = CIFS_NO_BUFFER; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server; int flags = 0; bool allocated = false; cifs_dbg(FYI, "Query Info\n"); - if (!ses || !(ses->server)) + if (!ses) + return -EIO; + server = ses->server; + if (!server) return -EIO; if (smb3_encryption_required(tcon)) @@ -3098,7 +3115,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (!*data) { *data = kmalloc(*dlen, GFP_KERNEL); if (!*data) { - cifs_dbg(VFS, + cifs_tcon_dbg(VFS, "Error %d allocating memory for acl\n", rc); *dlen = 0; @@ -3159,6 +3176,91 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, } /* + * CHANGE_NOTIFY Request is sent to get notifications on changes to a directory + * See MS-SMB2 2.2.35 and 2.2.36 + */ + +int +SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, + struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, + u32 completion_filter, bool watch_tree) +{ + struct smb2_change_notify_req *req; + struct kvec *iov = rqst->rq_iov; + unsigned int total_len; + int rc; + + rc = smb2_plain_req_init(SMB2_CHANGE_NOTIFY, tcon, (void **) &req, &total_len); + if (rc) + return rc; + + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE; + req->CompletionFilter = cpu_to_le32(completion_filter); + if (watch_tree) + req->Flags = cpu_to_le16(SMB2_WATCH_TREE); + else + req->Flags = 0; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + return 0; +} + +int +SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, bool watch_tree, + u32 completion_filter) +{ + struct cifs_ses *ses = tcon->ses; + struct smb_rqst rqst; + struct kvec iov[1]; + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype = CIFS_NO_BUFFER; + int flags = 0; + int rc = 0; + + cifs_dbg(FYI, "change notify\n"); + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = SMB2_notify_init(xid, &rqst, tcon, persistent_fid, volatile_fid, + completion_filter, watch_tree); + if (rc) + goto cnotify_exit; + + trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid, + (u8)watch_tree, completion_filter); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE); + trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid, + (u8)watch_tree, completion_filter, rc); + } else + trace_smb3_notify_done(xid, persistent_fid, tcon->tid, + ses->Suid, (u8)watch_tree, completion_filter); + + cnotify_exit: + if (rqst.rq_iov) + cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */ + free_rsp_buf(resp_buftype, rsp_iov.iov_base); + return rc; +} + + + +/* * This is a no-op for now. We're not really interested in the reply, but * rather in the fact that the server sent one and that server->lstrp * gets updated. @@ -3287,51 +3389,76 @@ SMB2_echo(struct TCP_Server_Info *server) return rc; } +void +SMB2_flush_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + int -SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid) +SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, + struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { - struct smb_rqst rqst; struct smb2_flush_req *req; - struct cifs_ses *ses = tcon->ses; - struct kvec iov[1]; - struct kvec rsp_iov; - int resp_buftype; - int rc = 0; - int flags = 0; + struct kvec *iov = rqst->rq_iov; unsigned int total_len; - - cifs_dbg(FYI, "Flush\n"); - - if (!ses || !(ses->server)) - return -EIO; + int rc; rc = smb2_plain_req_init(SMB2_FLUSH, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; + return 0; +} + +int +SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid) +{ + struct cifs_ses *ses = tcon->ses; + struct smb_rqst rqst; + struct kvec iov[1]; + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype = CIFS_NO_BUFFER; + int flags = 0; + int rc = 0; + + cifs_dbg(FYI, "flush\n"); + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); rqst.rq_iov = iov; rqst.rq_nvec = 1; + rc = SMB2_flush_init(xid, &rqst, tcon, persistent_fid, volatile_fid); + if (rc) + goto flush_exit; + + trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid); rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid, rc); - } + } else + trace_smb3_flush_done(xid, persistent_fid, tcon->tid, + ses->Suid); + flush_exit: + SMB2_flush_free(&rqst); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc; } @@ -3446,8 +3573,8 @@ smb2_readv_callback(struct mid_q_entry *mid) struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; - struct smb_rqst rqst = { .rq_iov = rdata->iov, - .rq_nvec = 2, + struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], + .rq_nvec = 1, .rq_pages = rdata->pages, .rq_offset = rdata->page_offset, .rq_npages = rdata->nr_pages, @@ -3468,7 +3595,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rc = smb2_verify_signature(&rqst, server); if (rc) - cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + cifs_tcon_dbg(VFS, "SMB signature verification returned error = %d\n", rc); } /* FIXME: should this be counted toward the initiating task? */ @@ -3595,7 +3722,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type) { struct smb_rqst rqst; - int resp_buftype, rc = -EACCES; + int resp_buftype, rc; struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; struct kvec iov[1]; @@ -4058,7 +4185,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; break; default: - cifs_dbg(VFS, "info level %u isn't supported\n", + cifs_tcon_dbg(VFS, "info level %u isn't supported\n", srch_inf->info_level); rc = -EINVAL; goto qdir_exit; @@ -4149,7 +4276,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else if (resp_buftype == CIFS_SMALL_BUFFER) srch_inf->smallBuf = true; else - cifs_dbg(VFS, "illegal search buffer type\n"); + cifs_tcon_dbg(VFS, "illegal search buffer type\n"); trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, srch_inf->entries_in_buffer); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 747de9317659..ea735d59c36e 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -143,7 +143,9 @@ struct smb2_transform_hdr { #define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) #define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) #define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ #define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) +#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ /* * Definitions for SMB2 Protocol Data Units (network frames) diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 07ca72486cfa..67a91b11fd59 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -158,6 +158,10 @@ extern int SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, extern void SMB2_close_free(struct smb_rqst *rqst); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, + struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id); +extern void SMB2_flush_free(struct smb_rqst *rqst); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 1ccbcf9c2c3b..148d7942c796 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -176,7 +176,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { - cifs_dbg(VFS, "%s: Could not find session\n", __func__); + cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); return 0; } @@ -185,21 +185,21 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = smb2_crypto_shash_allocate(server); if (rc) { - cifs_dbg(VFS, "%s: sha256 alloc failed\n", __func__); + cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); return rc; } rc = crypto_shash_setkey(server->secmech.hmacsha256, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + cifs_server_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } shash = &server->secmech.sdeschmacsha256->shash; rc = crypto_shash_init(shash); if (rc) { - cifs_dbg(VFS, "%s: Could not init sha256", __func__); + cifs_server_dbg(VFS, "%s: Could not init sha256", __func__); return rc; } @@ -215,7 +215,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_update(shash, iov[0].iov_base, iov[0].iov_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update with payload\n", + cifs_server_dbg(VFS, "%s: Could not update with payload\n", __func__); return rc; } @@ -239,68 +239,69 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, int rc = 0; unsigned char prfhash[SMB2_HMACSHA256_SIZE]; unsigned char *hashptr = prfhash; + struct TCP_Server_Info *server = ses->server; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); memset(key, 0x0, key_size); - rc = smb3_crypto_shash_allocate(ses->server); + rc = smb3_crypto_shash_allocate(server); if (rc) { - cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_setkey(ses->server->secmech.hmacsha256, + rc = crypto_shash_setkey(server->secmech.hmacsha256, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not set with session key\n", __func__); + cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); if (rc) { - cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__); + cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, i, 4); if (rc) { - cifs_dbg(VFS, "%s: Could not update with n\n", __func__); + cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, label.iov_base, label.iov_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update with label\n", __func__); + cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, &zero, 1); if (rc) { - cifs_dbg(VFS, "%s: Could not update with zero\n", __func__); + cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, context.iov_base, context.iov_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update with context\n", __func__); + cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, L, 4); if (rc) { - cifs_dbg(VFS, "%s: Could not update with L\n", __func__); + cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, hashptr); if (rc) { - cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); + cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; } @@ -436,7 +437,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { - cifs_dbg(VFS, "%s: Could not find session\n", __func__); + cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); return 0; } @@ -446,7 +447,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_setkey(server->secmech.cmacaes, ses->smb3signingkey, SMB2_CMACAES_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); + cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; } @@ -457,7 +458,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) */ rc = crypto_shash_init(shash); if (rc) { - cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); + cifs_server_dbg(VFS, "%s: Could not init cmac aes\n", __func__); return rc; } @@ -473,7 +474,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_update(shash, iov[0].iov_base, iov[0].iov_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update with payload\n", + cifs_server_dbg(VFS, "%s: Could not update with payload\n", __func__); return rc; } @@ -521,6 +522,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) if ((shdr->Command == SMB2_NEGOTIATE) || (shdr->Command == SMB2_SESSION_SETUP) || (shdr->Command == SMB2_OPLOCK_BREAK) || + server->ignore_signature || (!server->session_estab)) return 0; @@ -665,7 +667,7 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, rc = smb2_verify_signature(&rqst, server); if (rc) - cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + cifs_server_dbg(VFS, "SMB signature verification returned error = %d\n", rc); } @@ -739,7 +741,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - cifs_dbg(VFS, "%s: Failed to alloc encrypt aead\n", + cifs_server_dbg(VFS, "%s: Failed to alloc encrypt aead\n", __func__); return PTR_ERR(tfm); } @@ -754,7 +756,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) if (IS_ERR(tfm)) { crypto_free_aead(server->secmech.ccmaesencrypt); server->secmech.ccmaesencrypt = NULL; - cifs_dbg(VFS, "%s: Failed to alloc decrypt aead\n", + cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n", __func__); return PTR_ERR(tfm); } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 2b6d87bfdf8e..39a938443e3e 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -11,13 +11,14 @@ */ -#include <linux/crypto.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/fips.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/random.h> +#include <crypto/des.h> #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "cifspdu.h" @@ -58,19 +59,18 @@ static int smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) { unsigned char key2[8]; - struct crypto_cipher *tfm_des; + struct des_ctx ctx; str_to_key(key, key2); - tfm_des = crypto_alloc_cipher("des", 0, 0); - if (IS_ERR(tfm_des)) { - cifs_dbg(VFS, "could not allocate des crypto API\n"); - return PTR_ERR(tfm_des); + if (fips_enabled) { + cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n"); + return -ENOENT; } - crypto_cipher_setkey(tfm_des, key2, 8); - crypto_cipher_encrypt_one(tfm_des, out, in); - crypto_free_cipher(tfm_des); + des_expand_key(&ctx, key2, DES_KEY_SIZE); + des_encrypt(&ctx, out, in); + memzero_explicit(&ctx, sizeof(ctx)); return 0; } diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 99c4d799c24b..e7e350b13d6a 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -117,6 +117,41 @@ DEFINE_SMB3_RW_DONE_EVENT(falloc_done); /* * For handle based calls other than read and write, and get/set info */ +DECLARE_EVENT_CLASS(smb3_fd_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid), + TP_ARGS(xid, fid, tid, sesid), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid) +) + +#define DEFINE_SMB3_FD_EVENT(name) \ +DEFINE_EVENT(smb3_fd_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid), \ + TP_ARGS(xid, fid, tid, sesid)) + +DEFINE_SMB3_FD_EVENT(flush_enter); +DEFINE_SMB3_FD_EVENT(flush_done); +DEFINE_SMB3_FD_EVENT(close_enter); +DEFINE_SMB3_FD_EVENT(close_done); + DECLARE_EVENT_CLASS(smb3_fd_err_class, TP_PROTO(unsigned int xid, __u64 fid, @@ -200,6 +235,8 @@ DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \ DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter); DEFINE_SMB3_INF_ENTER_EVENT(query_info_done); +DEFINE_SMB3_INF_ENTER_EVENT(notify_enter); +DEFINE_SMB3_INF_ENTER_EVENT(notify_done); DECLARE_EVENT_CLASS(smb3_inf_err_class, TP_PROTO(unsigned int xid, @@ -246,6 +283,7 @@ DEFINE_EVENT(smb3_inf_err_class, smb3_##name, \ DEFINE_SMB3_INF_ERR_EVENT(query_info_err); DEFINE_SMB3_INF_ERR_EVENT(set_info_err); +DEFINE_SMB3_INF_ERR_EVENT(notify_err); DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 5d6d44bfe10a..308ad0f495e1 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -118,7 +118,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) #ifdef CONFIG_CIFS_STATS2 now = jiffies; if (now < midEntry->when_alloc) - cifs_dbg(VFS, "invalid mid allocation time\n"); + cifs_server_dbg(VFS, "invalid mid allocation time\n"); roundtrip_time = now - midEntry->when_alloc; if (smb_cmd < NUMBER_OF_SMB2_COMMANDS) { @@ -232,7 +232,7 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, retries++; if (retries >= 14 || (!server->noblocksnd && (retries > 2))) { - cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", + cifs_server_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", ssocket); return -EAGAIN; } @@ -246,7 +246,7 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, if (rc == 0) { /* should never happen, letting socket clear before retrying is our only obvious option here */ - cifs_dbg(VFS, "tcp sent no data\n"); + cifs_server_dbg(VFS, "tcp sent no data\n"); msleep(500); continue; } @@ -440,7 +440,7 @@ unmask: } smbd_done: if (rc < 0 && rc != -EINTR) - cifs_dbg(VFS, "Error %d sending data on socket to server\n", + cifs_server_dbg(VFS, "Error %d sending data on socket to server\n", rc); else if (rc > 0) rc = 0; @@ -473,8 +473,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, cur_rqst[0].rq_nvec = 1; if (!server->ops->init_transform_rq) { - cifs_dbg(VFS, "Encryption requested but transform callback " - "is missing\n"); + cifs_server_dbg(VFS, "Encryption requested but transform " + "callback is missing\n"); return -EIO; } @@ -532,6 +532,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if ((flags & CIFS_TIMEOUT_MASK) == CIFS_NON_BLOCKING) { /* oplock breaks must not be held up */ server->in_flight++; + if (server->in_flight > server->max_in_flight) + server->max_in_flight = server->in_flight; *credits -= 1; *instance = server->reconnect_instance; spin_unlock(&server->req_lock); @@ -548,7 +550,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if (!rc) { trace_smb3_credit_timeout(server->CurrentMid, server->hostname, num_credits); - cifs_dbg(VFS, "wait timed out after %d ms\n", + cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; } @@ -589,7 +591,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, trace_smb3_credit_timeout( server->CurrentMid, server->hostname, num_credits); - cifs_dbg(VFS, "wait timed out after %d ms\n", + cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; } @@ -608,6 +610,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { *credits -= num_credits; server->in_flight += num_credits; + if (server->in_flight > server->max_in_flight) + server->max_in_flight = server->in_flight; *instance = server->reconnect_instance; } spin_unlock(&server->req_lock); @@ -869,7 +873,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) break; default: list_del_init(&mid->qhead); - cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", + cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", __func__, mid->mid, mid->mid_state); rc = -EIO; } @@ -910,7 +914,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, rc = cifs_verify_signature(&rqst, server, mid->sequence_number); if (rc) - cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + cifs_server_dbg(VFS, "SMB signature verification returned error = %d\n", rc); } @@ -1107,7 +1111,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } if (rc != 0) { for (; i < num_rqst; i++) { - cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", + cifs_server_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); @@ -1242,17 +1246,19 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, struct kvec iov = { .iov_base = in_buf, .iov_len = len }; struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; struct cifs_credits credits = { .value = 1, .instance = 0 }; + struct TCP_Server_Info *server; if (ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); return -EIO; } - if (ses->server == NULL) { + server = ses->server; + if (server == NULL) { cifs_dbg(VFS, "Null tcp session\n"); return -EIO; } - if (ses->server->tcpStatus == CifsExiting) + if (server->tcpStatus == CifsExiting) return -ENOENT; /* Ensure that we do not send more than 50 overlapping requests @@ -1260,12 +1266,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, use ses->maxReq */ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + cifs_server_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", len); return -EIO; } - rc = wait_for_free_request(ses->server, flags, &credits.instance); + rc = wait_for_free_request(server, flags, &credits.instance); if (rc) return rc; @@ -1273,70 +1279,70 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, and avoid races inside tcp sendmsg code that could cause corruption of smb data */ - mutex_lock(&ses->server->srv_mutex); + mutex_lock(&server->srv_mutex); rc = allocate_mid(ses, in_buf, &midQ); if (rc) { mutex_unlock(&ses->server->srv_mutex); /* Update # of requests on wire to server */ - add_credits(ses->server, &credits, 0); + add_credits(server, &credits, 0); return rc; } - rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); goto out; } midQ->mid_state = MID_REQUEST_SUBMITTED; - cifs_in_send_inc(ses->server); - rc = smb_send(ses->server, in_buf, len); - cifs_in_send_dec(ses->server); + cifs_in_send_inc(server); + rc = smb_send(server, in_buf, len); + cifs_in_send_dec(server); cifs_save_when_sent(midQ); if (rc < 0) - ses->server->sequence_number -= 2; + server->sequence_number -= 2; - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); if (rc < 0) goto out; - rc = wait_for_response(ses->server, midQ); + rc = wait_for_response(server, midQ); if (rc != 0) { - send_cancel(ses->server, &rqst, midQ); + send_cancel(server, &rqst, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); - add_credits(ses->server, &credits, 0); + add_credits(server, &credits, 0); return rc; } spin_unlock(&GlobalMid_Lock); } - rc = cifs_sync_mid_result(midQ, ses->server); + rc = cifs_sync_mid_result(midQ, server); if (rc != 0) { - add_credits(ses->server, &credits, 0); + add_credits(server, &credits, 0); return rc; } if (!midQ->resp_buf || !out_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cifs_dbg(VFS, "Bad MID state?\n"); + cifs_server_dbg(VFS, "Bad MID state?\n"); goto out; } *pbytes_returned = get_rfc1002_length(midQ->resp_buf); memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); - rc = cifs_check_receive(midQ, ses->server, 0); + rc = cifs_check_receive(midQ, server, 0); out: cifs_delete_mid(midQ); - add_credits(ses->server, &credits, 0); + add_credits(server, &credits, 0); return rc; } @@ -1379,19 +1385,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov = { .iov_base = in_buf, .iov_len = len }; struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; unsigned int instance; + struct TCP_Server_Info *server; if (tcon == NULL || tcon->ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); return -EIO; } ses = tcon->ses; + server = ses->server; - if (ses->server == NULL) { + if (server == NULL) { cifs_dbg(VFS, "Null tcp session\n"); return -EIO; } - if (ses->server->tcpStatus == CifsExiting) + if (server->tcpStatus == CifsExiting) return -ENOENT; /* Ensure that we do not send more than 50 overlapping requests @@ -1399,12 +1407,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, use ses->maxReq */ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + cifs_tcon_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", len); return -EIO; } - rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance); + rc = wait_for_free_request(server, CIFS_BLOCKING_OP, &instance); if (rc) return rc; @@ -1412,31 +1420,31 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, and avoid races inside tcp sendmsg code that could cause corruption of smb data */ - mutex_lock(&ses->server->srv_mutex); + mutex_lock(&server->srv_mutex); rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); return rc; } - rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { cifs_delete_mid(midQ); - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); return rc; } midQ->mid_state = MID_REQUEST_SUBMITTED; - cifs_in_send_inc(ses->server); - rc = smb_send(ses->server, in_buf, len); - cifs_in_send_dec(ses->server); + cifs_in_send_inc(server); + rc = smb_send(server, in_buf, len); + cifs_in_send_dec(server); cifs_save_when_sent(midQ); if (rc < 0) - ses->server->sequence_number -= 2; + server->sequence_number -= 2; - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); if (rc < 0) { cifs_delete_mid(midQ); @@ -1444,21 +1452,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, } /* Wait for a reply - allow signals to interrupt. */ - rc = wait_event_interruptible(ses->server->response_q, + rc = wait_event_interruptible(server->response_q, (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); + ((server->tcpStatus != CifsGood) && + (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ if ((rc == -ERESTARTSYS) && (midQ->mid_state == MID_REQUEST_SUBMITTED) && - ((ses->server->tcpStatus == CifsGood) || - (ses->server->tcpStatus == CifsNew))) { + ((server->tcpStatus == CifsGood) || + (server->tcpStatus == CifsNew))) { if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the blocking lock to return. */ - rc = send_cancel(ses->server, &rqst, midQ); + rc = send_cancel(server, &rqst, midQ); if (rc) { cifs_delete_mid(midQ); return rc; @@ -1477,9 +1485,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = wait_for_response(ses->server, midQ); + rc = wait_for_response(server, midQ); if (rc) { - send_cancel(ses->server, &rqst, midQ); + send_cancel(server, &rqst, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ @@ -1494,20 +1502,20 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rstart = 1; } - rc = cifs_sync_mid_result(midQ, ses->server); + rc = cifs_sync_mid_result(midQ, server); if (rc != 0) return rc; /* rcvd frame is ok */ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cifs_dbg(VFS, "Bad MID state?\n"); + cifs_tcon_dbg(VFS, "Bad MID state?\n"); goto out; } *pbytes_returned = get_rfc1002_length(midQ->resp_buf); memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); - rc = cifs_check_receive(midQ, ses->server, 0); + rc = cifs_check_receive(midQ, server, 0); out: cifs_delete_mid(midQ); if (rstart && rc == -EACCES) diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 321f56e487cb..b1c70e2b9b1e 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -188,6 +188,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; sb->s_d_op = &coda_dentry_operations; + sb->s_time_gran = 1; + sb->s_time_min = S64_MIN; + sb->s_time_max = S64_MAX; error = super_setup_bdi(sb); if (error) diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 644d48c12ce8..3aec27e5eb82 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -63,11 +63,8 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, * Look up the pathname. Note that the pathname is in * user memory, and namei takes care of this */ - if (data.follow) - error = user_path(data.path, &path); - else - error = user_lpath(data.path, &path); - + error = user_path_at(AT_FDCWD, data.path, + data.follow ? LOOKUP_FOLLOW : 0, &path); if (error) return error; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 9c3d309839a8..680aba9c00d5 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -76,14 +76,14 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec64_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); + sd_iattr->ia_atime = timestamp_truncate(iattr->ia_atime, + inode); if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec64_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); + sd_iattr->ia_mtime = timestamp_truncate(iattr->ia_mtime, + inode); if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec64_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); + sd_iattr->ia_ctime = timestamp_truncate(iattr->ia_ctime, + inode); if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 9352487bd0fc..d12ea28836a5 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -24,6 +24,7 @@ #include <linux/blkdev.h> #include <linux/mtd/mtd.h> #include <linux/mtd/super.h> +#include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> @@ -506,18 +507,19 @@ static void cramfs_kill_sb(struct super_block *sb) kfree(sbi); } -static int cramfs_remount(struct super_block *sb, int *flags, char *data) +static int cramfs_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - *flags |= SB_RDONLY; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_RDONLY; return 0; } -static int cramfs_read_super(struct super_block *sb, - struct cramfs_super *super, int silent) +static int cramfs_read_super(struct super_block *sb, struct fs_context *fc, + struct cramfs_super *super) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; + bool silent = fc->sb_flags & SB_SILENT; /* We don't know the real size yet */ sbi->size = PAGE_SIZE; @@ -532,7 +534,7 @@ static int cramfs_read_super(struct super_block *sb, /* check for wrong endianness */ if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) - pr_err("wrong endianness\n"); + errorf(fc, "cramfs: wrong endianness"); return -EINVAL; } @@ -544,22 +546,22 @@ static int cramfs_read_super(struct super_block *sb, mutex_unlock(&read_mutex); if (super->magic != CRAMFS_MAGIC) { if (super->magic == CRAMFS_MAGIC_WEND && !silent) - pr_err("wrong endianness\n"); + errorf(fc, "cramfs: wrong endianness"); else if (!silent) - pr_err("wrong magic\n"); + errorf(fc, "cramfs: wrong magic"); return -EINVAL; } } /* get feature flags first */ if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { - pr_err("unsupported filesystem features\n"); + errorf(fc, "cramfs: unsupported filesystem features"); return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super->root.mode)) { - pr_err("root is not a directory\n"); + errorf(fc, "cramfs: root is not a directory"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ @@ -578,12 +580,12 @@ static int cramfs_read_super(struct super_block *sb, sbi->magic = super->magic; sbi->flags = super->flags; if (root_offset == 0) - pr_info("empty filesystem"); + infof(fc, "cramfs: empty filesystem"); else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { - pr_err("bad root offset %lu\n", root_offset); + errorf(fc, "cramfs: bad root offset %lu", root_offset); return -EINVAL; } @@ -597,6 +599,8 @@ static int cramfs_finalize_super(struct super_block *sb, /* Set it all up.. */ sb->s_flags |= SB_RDONLY; + sb->s_time_min = 0; + sb->s_time_max = 0; sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) @@ -607,8 +611,7 @@ static int cramfs_finalize_super(struct super_block *sb, return 0; } -static int cramfs_blkdev_fill_super(struct super_block *sb, void *data, - int silent) +static int cramfs_blkdev_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; @@ -623,14 +626,13 @@ static int cramfs_blkdev_fill_super(struct super_block *sb, void *data, for (i = 0; i < READ_BUFFERS; i++) buffer_blocknr[i] = -1; - err = cramfs_read_super(sb, &super, silent); + err = cramfs_read_super(sb, fc, &super); if (err) return err; return cramfs_finalize_super(sb, &super.root); } -static int cramfs_mtd_fill_super(struct super_block *sb, void *data, - int silent) +static int cramfs_mtd_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; @@ -652,7 +654,7 @@ static int cramfs_mtd_fill_super(struct super_block *sb, void *data, pr_info("checking physical address %pap for linear cramfs image\n", &sbi->linear_phys_addr); - err = cramfs_read_super(sb, &super, silent); + err = cramfs_read_super(sb, fc, &super); if (err) return err; @@ -947,32 +949,41 @@ static const struct inode_operations cramfs_dir_inode_operations = { }; static const struct super_operations cramfs_ops = { - .remount_fs = cramfs_remount, .statfs = cramfs_statfs, }; -static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int cramfs_get_tree(struct fs_context *fc) { - struct dentry *ret = ERR_PTR(-ENOPROTOOPT); + int ret = -ENOPROTOOPT; if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { - ret = mount_mtd(fs_type, flags, dev_name, data, - cramfs_mtd_fill_super); - if (!IS_ERR(ret)) + ret = get_tree_mtd(fc, cramfs_mtd_fill_super); + if (ret < 0) return ret; } - if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) { - ret = mount_bdev(fs_type, flags, dev_name, data, - cramfs_blkdev_fill_super); - } + if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) + ret = get_tree_bdev(fc, cramfs_blkdev_fill_super); return ret; } +static const struct fs_context_operations cramfs_context_ops = { + .get_tree = cramfs_get_tree, + .reconfigure = cramfs_reconfigure, +}; + +/* + * Set up the filesystem mount context. + */ +static int cramfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &cramfs_context_ops; + return 0; +} + static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", - .mount = cramfs_mount, + .init_fs_context = cramfs_init_fs_context, .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 5fdf24877c17..ff5a1746cbae 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -7,6 +7,8 @@ config FS_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_SHA512 + select CRYPTO_HMAC select KEYS help Enable encryption of files and directories. This diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 4f0df5e682e4..232e2bb5a337 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,5 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o +fscrypto-y := crypto.o \ + fname.o \ + hkdf.o \ + hooks.o \ + keyring.o \ + keysetup.o \ + keysetup_v1.o \ + policy.o + fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 45c3d0427fb2..32a7ad0098cc 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -141,7 +141,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, memset(iv, 0, ci->ci_mode->ivsize); iv->lblk_num = cpu_to_le64(lblk_num); - if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) + if (fscrypt_is_direct_key_policy(&ci->ci_policy)) memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); if (ci->ci_essiv_tfm != NULL) @@ -188,10 +188,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - fscrypt_err(inode->i_sb, - "%scryption failed for inode %lu, block %llu: %d", - (rw == FS_DECRYPT ? "de" : "en"), - inode->i_ino, lblk_num, res); + fscrypt_err(inode, "%scryption failed for block %llu: %d", + (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res); return res; } return 0; @@ -453,7 +451,7 @@ fail: return res; } -void fscrypt_msg(struct super_block *sb, const char *level, +void fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, @@ -467,8 +465,9 @@ void fscrypt_msg(struct super_block *sb, const char *level, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - if (sb) - printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf); + if (inode) + printk("%sfscrypt (%s, inode %lu): %pV\n", + level, inode->i_sb->s_id, inode->i_ino, &vaf); else printk("%sfscrypt: %pV\n", level, &vaf); va_end(args); @@ -479,6 +478,8 @@ void fscrypt_msg(struct super_block *sb, const char *level, */ static int __init fscrypt_init(void) { + int err = -ENOMEM; + /* * Use an unbound workqueue to allow bios to be decrypted in parallel * even when they happen to complete on the same CPU. This sacrifices @@ -501,31 +502,19 @@ static int __init fscrypt_init(void) if (!fscrypt_info_cachep) goto fail_free_ctx; + err = fscrypt_init_keyring(); + if (err) + goto fail_free_info; + return 0; +fail_free_info: + kmem_cache_destroy(fscrypt_info_cachep); fail_free_ctx: kmem_cache_destroy(fscrypt_ctx_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: - return -ENOMEM; -} -module_init(fscrypt_init) - -/** - * fscrypt_exit() - Shutdown the fs encryption system - */ -static void __exit fscrypt_exit(void) -{ - fscrypt_destroy(); - - if (fscrypt_read_workqueue) - destroy_workqueue(fscrypt_read_workqueue); - kmem_cache_destroy(fscrypt_ctx_cachep); - kmem_cache_destroy(fscrypt_info_cachep); - - fscrypt_essiv_cleanup(); + return err; } -module_exit(fscrypt_exit); - -MODULE_LICENSE("GPL"); +late_initcall(fscrypt_init) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 00d150ff3033..3da3707c10e3 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -71,9 +71,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - fscrypt_err(inode->i_sb, - "Filename encryption failed for inode %lu: %d", - inode->i_ino, res); + fscrypt_err(inode, "Filename encryption failed: %d", res); return res; } @@ -117,9 +115,7 @@ static int fname_decrypt(struct inode *inode, res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - fscrypt_err(inode->i_sb, - "Filename decryption failed for inode %lu: %d", - inode->i_ino, res); + fscrypt_err(inode, "Filename decryption failed: %d", res); return res; } @@ -127,44 +123,45 @@ static int fname_decrypt(struct inode *inode, return 0; } -static const char *lookup_table = +static const char lookup_table[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * digest_encode() - + * base64_encode() - * - * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. + * Encodes the input string using characters from the set [A-Za-z0-9+,]. * The encoded string is roughly 4/3 times the size of the input string. + * + * Return: length of the encoded string */ -static int digest_encode(const char *src, int len, char *dst) +static int base64_encode(const u8 *src, int len, char *dst) { - int i = 0, bits = 0, ac = 0; + int i, bits = 0, ac = 0; char *cp = dst; - while (i < len) { - ac += (((unsigned char) src[i]) << bits); + for (i = 0; i < len; i++) { + ac += src[i] << bits; bits += 8; do { *cp++ = lookup_table[ac & 0x3f]; ac >>= 6; bits -= 6; } while (bits >= 6); - i++; } if (bits) *cp++ = lookup_table[ac & 0x3f]; return cp - dst; } -static int digest_decode(const char *src, int len, char *dst) +static int base64_decode(const char *src, int len, u8 *dst) { - int i = 0, bits = 0, ac = 0; + int i, bits = 0, ac = 0; const char *p; - char *cp = dst; + u8 *cp = dst; - while (i < len) { + for (i = 0; i < len; i++) { p = strchr(lookup_table, src[i]); if (p == NULL || src[i] == 0) return -2; @@ -175,7 +172,6 @@ static int digest_decode(const char *src, int len, char *dst) ac >>= 8; bits -= 8; } - i++; } if (ac) return -1; @@ -185,8 +181,9 @@ static int digest_decode(const char *src, int len, char *dst) bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - int padding = 4 << (inode->i_crypt_info->ci_flags & - FS_POLICY_FLAGS_PAD_MASK); + const struct fscrypt_info *ci = inode->i_crypt_info; + int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) & + FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; if (orig_len > max_len) @@ -272,7 +269,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { - oname->len = digest_encode(iname->name, iname->len, + oname->len = base64_encode(iname->name, iname->len, oname->name); return 0; } @@ -287,7 +284,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, FSCRYPT_FNAME_DIGEST(iname->name, iname->len), FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode((const char *)&digested_name, + oname->len = 1 + base64_encode((const u8 *)&digested_name, sizeof(digested_name), oname->name + 1); return 0; } @@ -380,8 +377,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + digested, iname->len - digested, - fname->crypto_buf.name); + ret = base64_decode(iname->name + digested, iname->len - digested, + fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8978eec9d766..e84efc01512e 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -4,9 +4,8 @@ * * Copyright (C) 2015, Google, Inc. * - * This contains encryption key functions. - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H @@ -15,30 +14,133 @@ #include <linux/fscrypt.h> #include <crypto/hash.h> -/* Encryption parameters */ +#define CONST_STRLEN(str) (sizeof(str) - 1) + #define FS_KEY_DERIVATION_NONCE_SIZE 16 -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct fscrypt_context { - u8 format; +#define FSCRYPT_MIN_KEY_SIZE 16 + +#define FSCRYPT_CONTEXT_V1 1 +#define FSCRYPT_CONTEXT_V2 2 + +struct fscrypt_context_v1 { + u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; - u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; -} __packed; +}; -#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +struct fscrypt_context_v2 { + u8 version; /* FSCRYPT_CONTEXT_V2 */ + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 __reserved[4]; + u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +/** + * fscrypt_context - the encryption context of an inode + * + * This is the on-disk equivalent of an fscrypt_policy, stored alongside each + * encrypted file usually in a hidden extended attribute. It contains the + * fields from the fscrypt_policy, in order to identify the encryption algorithm + * and key with which the file is encrypted. It also contains a nonce that was + * randomly generated by fscrypt itself; this is used as KDF input or as a tweak + * to cause different files to be encrypted differently. + */ +union fscrypt_context { + u8 version; + struct fscrypt_context_v1 v1; + struct fscrypt_context_v2 v2; +}; + +/* + * Return the size expected for the given fscrypt_context based on its version + * number, or 0 if the context version is unrecognized. + */ +static inline int fscrypt_context_size(const union fscrypt_context *ctx) +{ + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + BUILD_BUG_ON(sizeof(ctx->v1) != 28); + return sizeof(ctx->v1); + case FSCRYPT_CONTEXT_V2: + BUILD_BUG_ON(sizeof(ctx->v2) != 40); + return sizeof(ctx->v2); + } + return 0; +} + +#undef fscrypt_policy +union fscrypt_policy { + u8 version; + struct fscrypt_policy_v1 v1; + struct fscrypt_policy_v2 v2; +}; + +/* + * Return the size expected for the given fscrypt_policy based on its version + * number, or 0 if the policy version is unrecognized. + */ +static inline int fscrypt_policy_size(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return sizeof(policy->v1); + case FSCRYPT_POLICY_V2: + return sizeof(policy->v2); + } + return 0; +} + +/* Return the contents encryption mode of a valid encryption policy */ +static inline u8 +fscrypt_policy_contents_mode(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return policy->v1.contents_encryption_mode; + case FSCRYPT_POLICY_V2: + return policy->v2.contents_encryption_mode; + } + BUG(); +} + +/* Return the filenames encryption mode of a valid encryption policy */ +static inline u8 +fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return policy->v1.filenames_encryption_mode; + case FSCRYPT_POLICY_V2: + return policy->v2.filenames_encryption_mode; + } + BUG(); +} + +/* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ +static inline u8 +fscrypt_policy_flags(const union fscrypt_policy *policy) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return policy->v1.flags; + case FSCRYPT_POLICY_V2: + return policy->v2.flags; + } + BUG(); +} + +static inline bool +fscrypt_is_direct_key_policy(const union fscrypt_policy *policy) +{ + return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY; +} /** * For encrypted symlinks, the ciphertext length is stored at the beginning @@ -68,23 +170,37 @@ struct fscrypt_info { struct crypto_cipher *ci_essiv_tfm; /* - * Encryption mode used for this inode. It corresponds to either - * ci_data_mode or ci_filename_mode, depending on the inode type. + * Encryption mode used for this inode. It corresponds to either the + * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; + /* Back-pointer to the inode */ + struct inode *ci_inode; + + /* + * The master key with which this inode was unlocked (decrypted). This + * will be NULL if the master key was found in a process-subscribed + * keyring rather than in the filesystem-level keyring. + */ + struct key *ci_master_key; + + /* + * Link in list of inodes that were unlocked with the master key. + * Only used when ->ci_master_key is set. + */ + struct list_head ci_master_key_link; + /* - * If non-NULL, then this inode uses a master key directly rather than a - * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm. - * Otherwise, this inode uses a derived key. + * If non-NULL, then encryption is done using the master key directly + * and ci_ctfm will equal ci_direct_key->dk_ctfm. */ - struct fscrypt_master_key *ci_master_key; + struct fscrypt_direct_key *ci_direct_key; - /* fields from the fscrypt_context */ - u8 ci_data_mode; - u8 ci_filename_mode; - u8 ci_flags; - u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + /* The encryption policy used by this inode */ + union fscrypt_policy ci_policy; + + /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; }; @@ -98,16 +214,16 @@ typedef enum { static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { - if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && - filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + if (contents_mode == FSCRYPT_MODE_AES_128_CBC && + filenames_mode == FSCRYPT_MODE_AES_128_CTS) return true; - if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && - filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_CTS) return true; - if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM && - filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM) + if (contents_mode == FSCRYPT_MODE_ADIANTUM && + filenames_mode == FSCRYPT_MODE_ADIANTUM) return true; return false; @@ -125,12 +241,12 @@ extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); extern const struct dentry_operations fscrypt_d_ops; extern void __printf(3, 4) __cold -fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...); +fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); -#define fscrypt_warn(sb, fmt, ...) \ - fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) -#define fscrypt_err(sb, fmt, ...) \ - fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) +#define fscrypt_warn(inode, fmt, ...) \ + fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) +#define fscrypt_err(inode, fmt, ...) \ + fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 @@ -155,7 +271,172 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); -/* keyinfo.c */ +/* hkdf.c */ + +struct fscrypt_hkdf { + struct crypto_shash *hmac_tfm; +}; + +extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); + +/* + * The list of contexts in which fscrypt uses HKDF. These values are used as + * the first byte of the HKDF application-specific info string to guarantee that + * info strings are never repeated between contexts. This ensures that all HKDF + * outputs are unique and cryptographically isolated, i.e. knowledge of one + * output doesn't reveal another. + */ +#define HKDF_CONTEXT_KEY_IDENTIFIER 1 +#define HKDF_CONTEXT_PER_FILE_KEY 2 +#define HKDF_CONTEXT_PER_MODE_KEY 3 + +extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); + +extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); + +/* keyring.c */ + +/* + * fscrypt_master_key_secret - secret key material of an in-use master key + */ +struct fscrypt_master_key_secret { + + /* + * For v2 policy keys: HKDF context keyed by this master key. + * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). + */ + struct fscrypt_hkdf hkdf; + + /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + u32 size; + + /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ + u8 raw[FSCRYPT_MAX_KEY_SIZE]; + +} __randomize_layout; + +/* + * fscrypt_master_key - an in-use master key + * + * This represents a master encryption key which has been added to the + * filesystem and can be used to "unlock" the encrypted files which were + * encrypted with it. + */ +struct fscrypt_master_key { + + /* + * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is + * executed, this is wiped and no new inodes can be unlocked with this + * key; however, there may still be inodes in ->mk_decrypted_inodes + * which could not be evicted. As long as some inodes still remain, + * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or + * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. + * + * Locking: protected by key->sem (outer) and mk_secret_sem (inner). + * The reason for two locks is that key->sem also protects modifying + * mk_users, which ranks it above the semaphore for the keyring key + * type, which is in turn above page faults (via keyring_read). But + * sometimes filesystems call fscrypt_get_encryption_info() from within + * a transaction, which ranks it below page faults. So we need a + * separate lock which protects mk_secret but not also mk_users. + */ + struct fscrypt_master_key_secret mk_secret; + struct rw_semaphore mk_secret_sem; + + /* + * For v1 policy keys: an arbitrary key descriptor which was assigned by + * userspace (->descriptor). + * + * For v2 policy keys: a cryptographic hash of this key (->identifier). + */ + struct fscrypt_key_specifier mk_spec; + + /* + * Keyring which contains a key of type 'key_type_fscrypt_user' for each + * user who has added this key. Normally each key will be added by just + * one user, but it's possible that multiple users share a key, and in + * that case we need to keep track of those users so that one user can't + * remove the key before the others want it removed too. + * + * This is NULL for v1 policy keys; those can only be added by root. + * + * Locking: in addition to this keyrings own semaphore, this is + * protected by the master key's key->sem, so we can do atomic + * search+insert. It can also be searched without taking any locks, but + * in that case the returned key may have already been removed. + */ + struct key *mk_users; + + /* + * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. + * Once this goes to 0, the master key is removed from ->s_master_keys. + * The 'struct fscrypt_master_key' will continue to live as long as the + * 'struct key' whose payload it is, but we won't let this reference + * count rise again. + */ + refcount_t mk_refcount; + + /* + * List of inodes that were unlocked using this key. This allows the + * inodes to be evicted efficiently if the key is removed. + */ + struct list_head mk_decrypted_inodes; + spinlock_t mk_decrypted_inodes_lock; + + /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */ + struct crypto_skcipher *mk_mode_keys[__FSCRYPT_MODE_MAX + 1]; + +} __randomize_layout; + +static inline bool +is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) +{ + /* + * The READ_ONCE() is only necessary for fscrypt_drop_inode() and + * fscrypt_key_describe(). These run in atomic context, so they can't + * take ->mk_secret_sem and thus 'secret' can change concurrently which + * would be a data race. But they only need to know whether the secret + * *was* present at the time of check, so READ_ONCE() suffices. + */ + return READ_ONCE(secret->size) != 0; +} + +static inline const char *master_key_spec_type( + const struct fscrypt_key_specifier *spec) +{ + switch (spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return "descriptor"; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return "identifier"; + } + return "[unknown]"; +} + +static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) +{ + switch (spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return FSCRYPT_KEY_DESCRIPTOR_SIZE; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return FSCRYPT_KEY_IDENTIFIER_SIZE; + } + return 0; +} + +extern struct key * +fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec); + +extern int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); + +extern int __init fscrypt_init_keyring(void); + +/* keysetup.c */ struct fscrypt_mode { const char *friendly_name; @@ -166,6 +447,36 @@ struct fscrypt_mode { bool needs_essiv; }; -extern void __exit fscrypt_essiv_cleanup(void); +static inline bool +fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode) +{ + return mode->ivsize >= offsetofend(union fscrypt_iv, nonce); +} + +extern struct crypto_skcipher * +fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, + const struct inode *inode); + +extern int fscrypt_set_derived_key(struct fscrypt_info *ci, + const u8 *derived_key); + +/* keysetup_v1.c */ + +extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); + +extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); + +extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings( + struct fscrypt_info *ci); +/* policy.c */ + +extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c new file mode 100644 index 000000000000..f21873e1b467 --- /dev/null +++ b/fs/crypto/hkdf.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * + * This is used to derive keys from the fscrypt master keys. + * + * Copyright 2019 Google LLC + */ + +#include <crypto/hash.h> +#include <crypto/sha.h> + +#include "fscrypt_private.h" + +/* + * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses + * SHA-512 because it is reasonably secure and efficient; and since it produces + * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of + * entropy from the master key and requires only one iteration of HKDF-Expand. + */ +#define HKDF_HMAC_ALG "hmac(sha512)" +#define HKDF_HASHLEN SHA512_DIGEST_SIZE + +/* + * HKDF consists of two steps: + * + * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from + * the input keying material and optional salt. + * 2. HKDF-Expand: expand the pseudorandom key into output keying material of + * any length, parameterized by an application-specific info string. + * + * HKDF-Extract can be skipped if the input is already a pseudorandom key of + * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take + * shorter keys, and we don't want to force users of those modes to provide + * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No + * salt is used, since fscrypt master keys should already be pseudorandom and + * there's no way to persist a random salt per master key from kernel mode. + */ + +/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ +static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) +{ + static const u8 default_salt[HKDF_HASHLEN]; + SHASH_DESC_ON_STACK(desc, hmac_tfm); + int err; + + err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); + if (err) + return err; + + desc->tfm = hmac_tfm; + err = crypto_shash_digest(desc, ikm, ikmlen, prk); + shash_desc_zero(desc); + return err; +} + +/* + * Compute HKDF-Extract using the given master key as the input keying material, + * and prepare an HMAC transform object keyed by the resulting pseudorandom key. + * + * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many + * times without having to recompute HKDF-Extract each time. + */ +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size) +{ + struct crypto_shash *hmac_tfm; + u8 prk[HKDF_HASHLEN]; + int err; + + hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); + if (IS_ERR(hmac_tfm)) { + fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", + PTR_ERR(hmac_tfm)); + return PTR_ERR(hmac_tfm); + } + + if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { + err = -EINVAL; + goto err_free_tfm; + } + + err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + if (err) + goto err_free_tfm; + + err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); + if (err) + goto err_free_tfm; + + hkdf->hmac_tfm = hmac_tfm; + goto out; + +err_free_tfm: + crypto_free_shash(hmac_tfm); +out: + memzero_explicit(prk, sizeof(prk)); + return err; +} + +/* + * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which + * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' + * bytes of output keying material parameterized by the application-specific + * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' + * byte. This is thread-safe and may be called by multiple threads in parallel. + * + * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt + * adds to its application-specific info strings to guarantee that it doesn't + * accidentally repeat an info string when using HKDF for different purposes.) + */ +int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) +{ + SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); + u8 prefix[9]; + unsigned int i; + int err; + const u8 *prev = NULL; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + if (WARN_ON(okmlen > 255 * HKDF_HASHLEN)) + return -EINVAL; + + desc->tfm = hkdf->hmac_tfm; + + memcpy(prefix, "fscrypt\0", 8); + prefix[8] = context; + + for (i = 0; i < okmlen; i += HKDF_HASHLEN) { + + err = crypto_shash_init(desc); + if (err) + goto out; + + if (prev) { + err = crypto_shash_update(desc, prev, HKDF_HASHLEN); + if (err) + goto out; + } + + err = crypto_shash_update(desc, prefix, sizeof(prefix)); + if (err) + goto out; + + err = crypto_shash_update(desc, info, infolen); + if (err) + goto out; + + BUILD_BUG_ON(sizeof(counter) != 1); + if (okmlen - i < HKDF_HASHLEN) { + err = crypto_shash_finup(desc, &counter, 1, tmp); + if (err) + goto out; + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + err = crypto_shash_finup(desc, &counter, 1, &okm[i]); + if (err) + goto out; + } + counter++; + prev = &okm[i]; + } + err = 0; +out: + if (unlikely(err)) + memzero_explicit(okm, okmlen); /* so caller doesn't need to */ + shash_desc_zero(desc); + return err; +} + +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) +{ + crypto_free_shash(hkdf->hmac_tfm); +} diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index c1d6715d88e9..bb3b7fcfdd48 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -39,9 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { - fscrypt_warn(inode->i_sb, - "inconsistent encryption contexts: %lu/%lu", - d_inode(dir)->i_ino, inode->i_ino); + fscrypt_warn(inode, + "Inconsistent encryption context (parent directory: %lu)", + d_inode(dir)->i_ino); err = -EPERM; } dput(dir); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c deleted file mode 100644 index 207ebed918c1..000000000000 --- a/fs/crypto/keyinfo.c +++ /dev/null @@ -1,611 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * key management facility for FS encryption support. - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions. - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ - -#include <keys/user-type.h> -#include <linux/hashtable.h> -#include <linux/scatterlist.h> -#include <crypto/aes.h> -#include <crypto/algapi.h> -#include <crypto/sha.h> -#include <crypto/skcipher.h> -#include "fscrypt_private.h" - -static struct crypto_shash *essiv_hash_tfm; - -/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */ -static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */ -static DEFINE_SPINLOCK(fscrypt_master_keys_lock); - -/* - * Key derivation function. This generates the derived key by encrypting the - * master key with AES-128-ECB using the inode's nonce as the AES key. - * - * The master key must be at least as long as the derived key. If the master - * key is longer, then only the first 'derived_keysize' bytes are used. - */ -static int derive_key_aes(const u8 *master_key, - const struct fscrypt_context *ctx, - u8 *derived_key, unsigned int derived_keysize) -{ - int res = 0; - struct skcipher_request *req = NULL; - DECLARE_CRYPTO_WAIT(wait); - struct scatterlist src_sg, dst_sg; - struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce)); - if (res < 0) - goto out; - - sg_init_one(&src_sg, master_key, derived_keysize); - sg_init_one(&dst_sg, derived_key, derived_keysize); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, - NULL); - res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); -out: - skcipher_request_free(req); - crypto_free_skcipher(tfm); - return res; -} - -/* - * Search the current task's subscribed keyrings for a "logon" key with - * description prefix:descriptor, and if found acquire a read lock on it and - * return a pointer to its validated payload in *payload_ret. - */ -static struct key * -find_and_lock_process_key(const char *prefix, - const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], - unsigned int min_keysize, - const struct fscrypt_key **payload_ret) -{ - char *description; - struct key *key; - const struct user_key_payload *ukp; - const struct fscrypt_key *payload; - - description = kasprintf(GFP_NOFS, "%s%*phN", prefix, - FS_KEY_DESCRIPTOR_SIZE, descriptor); - if (!description) - return ERR_PTR(-ENOMEM); - - key = request_key(&key_type_logon, description, NULL); - kfree(description); - if (IS_ERR(key)) - return key; - - down_read(&key->sem); - ukp = user_key_payload_locked(key); - - if (!ukp) /* was the key revoked before we acquired its semaphore? */ - goto invalid; - - payload = (const struct fscrypt_key *)ukp->data; - - if (ukp->datalen != sizeof(struct fscrypt_key) || - payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) { - fscrypt_warn(NULL, - "key with description '%s' has invalid payload", - key->description); - goto invalid; - } - - if (payload->size < min_keysize) { - fscrypt_warn(NULL, - "key with description '%s' is too short (got %u bytes, need %u+ bytes)", - key->description, payload->size, min_keysize); - goto invalid; - } - - *payload_ret = payload; - return key; - -invalid: - up_read(&key->sem); - key_put(key); - return ERR_PTR(-ENOKEY); -} - -static struct fscrypt_mode available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { - .friendly_name = "AES-256-XTS", - .cipher_str = "xts(aes)", - .keysize = 64, - .ivsize = 16, - }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { - .friendly_name = "AES-256-CTS-CBC", - .cipher_str = "cts(cbc(aes))", - .keysize = 32, - .ivsize = 16, - }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { - .friendly_name = "AES-128-CBC", - .cipher_str = "cbc(aes)", - .keysize = 16, - .ivsize = 16, - .needs_essiv = true, - }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { - .friendly_name = "AES-128-CTS-CBC", - .cipher_str = "cts(cbc(aes))", - .keysize = 16, - .ivsize = 16, - }, - [FS_ENCRYPTION_MODE_ADIANTUM] = { - .friendly_name = "Adiantum", - .cipher_str = "adiantum(xchacha12,aes)", - .keysize = 32, - .ivsize = 32, - }, -}; - -static struct fscrypt_mode * -select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode) -{ - if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { - fscrypt_warn(inode->i_sb, - "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", - inode->i_ino, ci->ci_data_mode, - ci->ci_filename_mode); - return ERR_PTR(-EINVAL); - } - - if (S_ISREG(inode->i_mode)) - return &available_modes[ci->ci_data_mode]; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - return &available_modes[ci->ci_filename_mode]; - - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", - inode->i_ino, (inode->i_mode & S_IFMT)); - return ERR_PTR(-EINVAL); -} - -/* Find the master key, then derive the inode's actual encryption key */ -static int find_and_derive_key(const struct inode *inode, - const struct fscrypt_context *ctx, - u8 *derived_key, const struct fscrypt_mode *mode) -{ - struct key *key; - const struct fscrypt_key *payload; - int err; - - key = find_and_lock_process_key(FS_KEY_DESC_PREFIX, - ctx->master_key_descriptor, - mode->keysize, &payload); - if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { - key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix, - ctx->master_key_descriptor, - mode->keysize, &payload); - } - if (IS_ERR(key)) - return PTR_ERR(key); - - if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) { - if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) { - fscrypt_warn(inode->i_sb, - "direct key mode not allowed with %s", - mode->friendly_name); - err = -EINVAL; - } else if (ctx->contents_encryption_mode != - ctx->filenames_encryption_mode) { - fscrypt_warn(inode->i_sb, - "direct key mode not allowed with different contents and filenames modes"); - err = -EINVAL; - } else { - memcpy(derived_key, payload->raw, mode->keysize); - err = 0; - } - } else { - err = derive_key_aes(payload->raw, ctx, derived_key, - mode->keysize); - } - up_read(&key->sem); - key_put(key); - return err; -} - -/* Allocate and key a symmetric cipher object for the given encryption mode */ -static struct crypto_skcipher * -allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode) -{ - struct crypto_skcipher *tfm; - int err; - - tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); - if (IS_ERR(tfm)) { - fscrypt_warn(inode->i_sb, - "error allocating '%s' transform for inode %lu: %ld", - mode->cipher_str, inode->i_ino, PTR_ERR(tfm)); - return tfm; - } - if (unlikely(!mode->logged_impl_name)) { - /* - * fscrypt performance can vary greatly depending on which - * crypto algorithm implementation is used. Help people debug - * performance problems by logging the ->cra_driver_name the - * first time a mode is used. Note that multiple threads can - * race here, but it doesn't really matter. - */ - mode->logged_impl_name = true; - pr_info("fscrypt: %s using implementation \"%s\"\n", - mode->friendly_name, - crypto_skcipher_alg(tfm)->base.cra_driver_name); - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); - if (err) - goto err_free_tfm; - - return tfm; - -err_free_tfm: - crypto_free_skcipher(tfm); - return ERR_PTR(err); -} - -/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */ -struct fscrypt_master_key { - struct hlist_node mk_node; - refcount_t mk_refcount; - const struct fscrypt_mode *mk_mode; - struct crypto_skcipher *mk_ctfm; - u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE]; - u8 mk_raw[FS_MAX_KEY_SIZE]; -}; - -static void free_master_key(struct fscrypt_master_key *mk) -{ - if (mk) { - crypto_free_skcipher(mk->mk_ctfm); - kzfree(mk); - } -} - -static void put_master_key(struct fscrypt_master_key *mk) -{ - if (!refcount_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock)) - return; - hash_del(&mk->mk_node); - spin_unlock(&fscrypt_master_keys_lock); - - free_master_key(mk); -} - -/* - * Find/insert the given master key into the fscrypt_master_keys table. If - * found, it is returned with elevated refcount, and 'to_insert' is freed if - * non-NULL. If not found, 'to_insert' is inserted and returned if it's - * non-NULL; otherwise NULL is returned. - */ -static struct fscrypt_master_key * -find_or_insert_master_key(struct fscrypt_master_key *to_insert, - const u8 *raw_key, const struct fscrypt_mode *mode, - const struct fscrypt_info *ci) -{ - unsigned long hash_key; - struct fscrypt_master_key *mk; - - /* - * Careful: to avoid potentially leaking secret key bytes via timing - * information, we must key the hash table by descriptor rather than by - * raw key, and use crypto_memneq() when comparing raw keys. - */ - - BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE); - memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key)); - - spin_lock(&fscrypt_master_keys_lock); - hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) { - if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor, - FS_KEY_DESCRIPTOR_SIZE) != 0) - continue; - if (mode != mk->mk_mode) - continue; - if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize)) - continue; - /* using existing tfm with same (descriptor, mode, raw_key) */ - refcount_inc(&mk->mk_refcount); - spin_unlock(&fscrypt_master_keys_lock); - free_master_key(to_insert); - return mk; - } - if (to_insert) - hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key); - spin_unlock(&fscrypt_master_keys_lock); - return to_insert; -} - -/* Prepare to encrypt directly using the master key in the given mode */ -static struct fscrypt_master_key * -fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode, - const u8 *raw_key, const struct inode *inode) -{ - struct fscrypt_master_key *mk; - int err; - - /* Is there already a tfm for this key? */ - mk = find_or_insert_master_key(NULL, raw_key, mode, ci); - if (mk) - return mk; - - /* Nope, allocate one. */ - mk = kzalloc(sizeof(*mk), GFP_NOFS); - if (!mk) - return ERR_PTR(-ENOMEM); - refcount_set(&mk->mk_refcount, 1); - mk->mk_mode = mode; - mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode); - if (IS_ERR(mk->mk_ctfm)) { - err = PTR_ERR(mk->mk_ctfm); - mk->mk_ctfm = NULL; - goto err_free_mk; - } - memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - memcpy(mk->mk_raw, raw_key, mode->keysize); - - return find_or_insert_master_key(mk, raw_key, mode, ci); - -err_free_mk: - free_master_key(mk); - return ERR_PTR(err); -} - -static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) -{ - struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); - - /* init hash transform on demand */ - if (unlikely(!tfm)) { - struct crypto_shash *prev_tfm; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - fscrypt_warn(NULL, - "error allocating SHA-256 transform: %ld", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); - if (prev_tfm) { - crypto_free_shash(tfm); - tfm = prev_tfm; - } - } - - { - SHASH_DESC_ON_STACK(desc, tfm); - desc->tfm = tfm; - - return crypto_shash_digest(desc, key, keysize, salt); - } -} - -static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, - int keysize) -{ - int err; - struct crypto_cipher *essiv_tfm; - u8 salt[SHA256_DIGEST_SIZE]; - - essiv_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(essiv_tfm)) - return PTR_ERR(essiv_tfm); - - ci->ci_essiv_tfm = essiv_tfm; - - err = derive_essiv_salt(raw_key, keysize, salt); - if (err) - goto out; - - /* - * Using SHA256 to derive the salt/key will result in AES-256 being - * used for IV generation. File contents encryption will still use the - * configured keysize (AES-128) nevertheless. - */ - err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); - if (err) - goto out; - -out: - memzero_explicit(salt, sizeof(salt)); - return err; -} - -void __exit fscrypt_essiv_cleanup(void) -{ - crypto_free_shash(essiv_hash_tfm); -} - -/* - * Given the encryption mode and key (normally the derived key, but for - * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's - * symmetric cipher transform object(s). - */ -static int setup_crypto_transform(struct fscrypt_info *ci, - struct fscrypt_mode *mode, - const u8 *raw_key, const struct inode *inode) -{ - struct fscrypt_master_key *mk; - struct crypto_skcipher *ctfm; - int err; - - if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) { - mk = fscrypt_get_master_key(ci, mode, raw_key, inode); - if (IS_ERR(mk)) - return PTR_ERR(mk); - ctfm = mk->mk_ctfm; - } else { - mk = NULL; - ctfm = allocate_skcipher_for_mode(mode, raw_key, inode); - if (IS_ERR(ctfm)) - return PTR_ERR(ctfm); - } - ci->ci_master_key = mk; - ci->ci_ctfm = ctfm; - - if (mode->needs_essiv) { - /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ - WARN_ON(mode->ivsize != AES_BLOCK_SIZE); - WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY); - - err = init_essiv_generator(ci, raw_key, mode->keysize); - if (err) { - fscrypt_warn(inode->i_sb, - "error initializing ESSIV generator for inode %lu: %d", - inode->i_ino, err); - return err; - } - } - return 0; -} - -static void put_crypt_info(struct fscrypt_info *ci) -{ - if (!ci) - return; - - if (ci->ci_master_key) { - put_master_key(ci->ci_master_key); - } else { - crypto_free_skcipher(ci->ci_ctfm); - crypto_free_cipher(ci->ci_essiv_tfm); - } - kmem_cache_free(fscrypt_info_cachep, ci); -} - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *crypt_info; - struct fscrypt_context ctx; - struct fscrypt_mode *mode; - u8 *raw_key = NULL; - int res; - - if (fscrypt_has_encryption_key(inode)) - return 0; - - res = fscrypt_initialize(inode->i_sb->s_cop->flags); - if (res) - return res; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) - return res; - /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - } else if (res != sizeof(ctx)) { - return -EINVAL; - } - - if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - - if (ctx.flags & ~FS_POLICY_FLAGS_VALID) - return -EINVAL; - - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); - - mode = select_encryption_mode(crypt_info, inode); - if (IS_ERR(mode)) { - res = PTR_ERR(mode); - goto out; - } - WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); - crypt_info->ci_mode = mode; - - /* - * This cannot be a stack buffer because it may be passed to the - * scatterlist crypto API as part of key derivation. - */ - res = -ENOMEM; - raw_key = kmalloc(mode->keysize, GFP_NOFS); - if (!raw_key) - goto out; - - res = find_and_derive_key(inode, &ctx, raw_key, mode); - if (res) - goto out; - - res = setup_crypto_transform(crypt_info, mode, raw_key, inode); - if (res) - goto out; - - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) - crypt_info = NULL; -out: - if (res == -ENOKEY) - res = 0; - put_crypt_info(crypt_info); - kzfree(raw_key); - return res; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); - -/** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data - * - * Free the inode's fscrypt_info. Filesystems must call this when the inode is - * being evicted. An RCU grace period need not have elapsed yet. - */ -void fscrypt_put_encryption_info(struct inode *inode) -{ - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; -} -EXPORT_SYMBOL(fscrypt_put_encryption_info); - -/** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay - * - * Free the inode's cached decrypted symlink target, if any. Filesystems must - * call this after an RCU grace period, just before they free the inode. - */ -void fscrypt_free_inode(struct inode *inode) -{ - if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { - kfree(inode->i_link); - inode->i_link = NULL; - } -} -EXPORT_SYMBOL(fscrypt_free_inode); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c new file mode 100644 index 000000000000..c34fa7c61b43 --- /dev/null +++ b/fs/crypto/keyring.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Filesystem-level keyring for fscrypt + * + * Copyright 2019 Google LLC + */ + +/* + * This file implements management of fscrypt master keys in the + * filesystem-level keyring, including the ioctls: + * + * - FS_IOC_ADD_ENCRYPTION_KEY + * - FS_IOC_REMOVE_ENCRYPTION_KEY + * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS + * - FS_IOC_GET_ENCRYPTION_KEY_STATUS + * + * See the "User API" section of Documentation/filesystems/fscrypt.rst for more + * information about these ioctls. + */ + +#include <crypto/skcipher.h> +#include <linux/key-type.h> +#include <linux/seq_file.h> + +#include "fscrypt_private.h" + +static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) +{ + fscrypt_destroy_hkdf(&secret->hkdf); + memzero_explicit(secret, sizeof(*secret)); +} + +static void move_master_key_secret(struct fscrypt_master_key_secret *dst, + struct fscrypt_master_key_secret *src) +{ + memcpy(dst, src, sizeof(*dst)); + memzero_explicit(src, sizeof(*src)); +} + +static void free_master_key(struct fscrypt_master_key *mk) +{ + size_t i; + + wipe_master_key_secret(&mk->mk_secret); + + for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++) + crypto_free_skcipher(mk->mk_mode_keys[i]); + + key_put(mk->mk_users); + kzfree(mk); +} + +static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} + +static int fscrypt_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + key->payload.data[0] = (struct fscrypt_master_key *)prep->data; + return 0; +} + +static void fscrypt_key_destroy(struct key *key) +{ + free_master_key(key->payload.data[0]); +} + +static void fscrypt_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); + + if (key_is_positive(key)) { + const struct fscrypt_master_key *mk = key->payload.data[0]; + + if (!is_master_key_secret_present(&mk->mk_secret)) + seq_puts(m, ": secret removed"); + } +} + +/* + * Type of key in ->s_master_keys. Each key of this type represents a master + * key which has been added to the filesystem. Its payload is a + * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents + * users from adding keys of this type via the keyrings syscalls rather than via + * the intended method of FS_IOC_ADD_ENCRYPTION_KEY. + */ +static struct key_type key_type_fscrypt = { + .name = "._fscrypt", + .instantiate = fscrypt_key_instantiate, + .destroy = fscrypt_key_destroy, + .describe = fscrypt_key_describe, +}; + +static int fscrypt_user_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + /* + * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for + * each key, regardless of the exact key size. The amount of memory + * actually used is greater than the size of the raw key anyway. + */ + return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE); +} + +static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * Type of key in ->mk_users. Each key of this type represents a particular + * user who has added a particular master key. + * + * Note that the name of this key type really should be something like + * ".fscrypt-user" instead of simply ".fscrypt". But the shorter name is chosen + * mainly for simplicity of presentation in /proc/keys when read by a non-root + * user. And it is expected to be rare that a key is actually added by multiple + * users, since users should keep their encryption keys confidential. + */ +static struct key_type key_type_fscrypt_user = { + .name = ".fscrypt", + .instantiate = fscrypt_user_key_instantiate, + .describe = fscrypt_user_key_describe, +}; + +/* Search ->s_master_keys or ->mk_users */ +static struct key *search_fscrypt_keyring(struct key *keyring, + struct key_type *type, + const char *description) +{ + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + key_ref_t keyref = make_key_ref(keyring, true /* possessed */); + + keyref = keyring_search(keyref, type, description, false); + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); +} + +#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ + (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id)) + +#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) + +#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ + (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ + CONST_STRLEN("-users") + 1) + +#define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ + (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) + +static void format_fs_keyring_description( + char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE], + const struct super_block *sb) +{ + sprintf(description, "fscrypt-%s", sb->s_id); +} + +static void format_mk_description( + char description[FSCRYPT_MK_DESCRIPTION_SIZE], + const struct fscrypt_key_specifier *mk_spec) +{ + sprintf(description, "%*phN", + master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); +} + +static void format_mk_users_keyring_description( + char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], + const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ + sprintf(description, "fscrypt-%*phN-users", + FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); +} + +static void format_mk_user_description( + char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE], + const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ + + sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE, + mk_identifier, __kuid_val(current_fsuid())); +} + +/* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ +static int allocate_filesystem_keyring(struct super_block *sb) +{ + char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; + struct key *keyring; + + if (sb->s_master_keys) + return 0; + + format_fs_keyring_description(description, sb); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + /* Pairs with READ_ONCE() in fscrypt_find_master_key() */ + smp_store_release(&sb->s_master_keys, keyring); + return 0; +} + +void fscrypt_sb_free(struct super_block *sb) +{ + key_put(sb->s_master_keys); + sb->s_master_keys = NULL; +} + +/* + * Find the specified master key in ->s_master_keys. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +struct key *fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec) +{ + struct key *keyring; + char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + + /* pairs with smp_store_release() in allocate_filesystem_keyring() */ + keyring = READ_ONCE(sb->s_master_keys); + if (keyring == NULL) + return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */ + + format_mk_description(description, mk_spec); + return search_fscrypt_keyring(keyring, &key_type_fscrypt, description); +} + +static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) +{ + char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE]; + struct key *keyring; + + format_mk_users_keyring_description(description, + mk->mk_spec.u.identifier); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + mk->mk_users = keyring; + return 0; +} + +/* + * Find the current user's "key" in the master key's ->mk_users. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +static struct key *find_master_key_user(struct fscrypt_master_key *mk) +{ + char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user, + description); +} + +/* + * Give the current user a "key" in ->mk_users. This charges the user's quota + * and marks the master key as added by the current user, so that it cannot be + * removed by another user with the key. Either the master key's key->sem must + * be held for write, or the master key must be still undergoing initialization. + */ +static int add_master_key_user(struct fscrypt_master_key *mk) +{ + char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + struct key *mk_user; + int err; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + mk_user = key_alloc(&key_type_fscrypt_user, description, + current_fsuid(), current_gid(), current_cred(), + KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + + err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); + key_put(mk_user); + return err; +} + +/* + * Remove the current user's "key" from ->mk_users. + * The master key's key->sem must be held for write. + * + * Returns 0 if removed, -ENOKEY if not found, or another -errno code. + */ +static int remove_master_key_user(struct fscrypt_master_key *mk) +{ + struct key *mk_user; + int err; + + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + err = key_unlink(mk->mk_users, mk_user); + key_put(mk_user); + return err; +} + +/* + * Allocate a new fscrypt_master_key which contains the given secret, set it as + * the payload of a new 'struct key' of type fscrypt, and link the 'struct key' + * into the given keyring. Synchronized by fscrypt_add_key_mutex. + */ +static int add_new_master_key(struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec, + struct key *keyring) +{ + struct fscrypt_master_key *mk; + char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + struct key *key; + int err; + + mk = kzalloc(sizeof(*mk), GFP_KERNEL); + if (!mk) + return -ENOMEM; + + mk->mk_spec = *mk_spec; + + move_master_key_secret(&mk->mk_secret, secret); + init_rwsem(&mk->mk_secret_sem); + + refcount_set(&mk->mk_refcount, 1); /* secret is present */ + INIT_LIST_HEAD(&mk->mk_decrypted_inodes); + spin_lock_init(&mk->mk_decrypted_inodes_lock); + + if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = allocate_master_key_users_keyring(mk); + if (err) + goto out_free_mk; + err = add_master_key_user(mk); + if (err) + goto out_free_mk; + } + + /* + * Note that we don't charge this key to anyone's quota, since when + * ->mk_users is in use those keys are charged instead, and otherwise + * (when ->mk_users isn't in use) only root can add these keys. + */ + format_mk_description(description, mk_spec); + key = key_alloc(&key_type_fscrypt, description, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out_free_mk; + } + err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); + key_put(key); + if (err) + goto out_free_mk; + + return 0; + +out_free_mk: + free_master_key(mk); + return err; +} + +#define KEY_DEAD 1 + +static int add_existing_master_key(struct fscrypt_master_key *mk, + struct fscrypt_master_key_secret *secret) +{ + struct key *mk_user; + bool rekey; + int err; + + /* + * If the current user is already in ->mk_users, then there's nothing to + * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + */ + if (mk->mk_users) { + mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + key_put(mk_user); + return 0; + } + } + + /* If we'll be re-adding ->mk_secret, try to take the reference. */ + rekey = !is_master_key_secret_present(&mk->mk_secret); + if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) + return KEY_DEAD; + + /* Add the current user to ->mk_users, if applicable. */ + if (mk->mk_users) { + err = add_master_key_user(mk); + if (err) { + if (rekey && refcount_dec_and_test(&mk->mk_refcount)) + return KEY_DEAD; + return err; + } + } + + /* Re-add the secret if needed. */ + if (rekey) { + down_write(&mk->mk_secret_sem); + move_master_key_secret(&mk->mk_secret, secret); + up_write(&mk->mk_secret_sem); + } + return 0; +} + +static int add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) +{ + static DEFINE_MUTEX(fscrypt_add_key_mutex); + struct key *key; + int err; + + mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ +retry: + key = fscrypt_find_master_key(sb, mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + if (err != -ENOKEY) + goto out_unlock; + /* Didn't find the key in ->s_master_keys. Add it. */ + err = allocate_filesystem_keyring(sb); + if (err) + goto out_unlock; + err = add_new_master_key(secret, mk_spec, sb->s_master_keys); + } else { + /* + * Found the key in ->s_master_keys. Re-add the secret if + * needed, and add the user to ->mk_users if needed. + */ + down_write(&key->sem); + err = add_existing_master_key(key->payload.data[0], secret); + up_write(&key->sem); + if (err == KEY_DEAD) { + /* Key being removed or needs to be removed */ + key_invalidate(key); + key_put(key); + goto retry; + } + key_put(key); + } +out_unlock: + mutex_unlock(&fscrypt_add_key_mutex); + return err; +} + +/* + * Add a master encryption key to the filesystem, causing all files which were + * encrypted with it to appear "unlocked" (decrypted) when accessed. + * + * When adding a key for use by v1 encryption policies, this ioctl is + * privileged, and userspace must provide the 'key_descriptor'. + * + * When adding a key for use by v2+ encryption policies, this ioctl is + * unprivileged. This is needed, in general, to allow non-root users to use + * encryption without encountering the visibility problems of process-subscribed + * keyrings and the inability to properly remove keys. This works by having + * each key identified by its cryptographically secure hash --- the + * 'key_identifier'. The cryptographic hash ensures that a malicious user + * cannot add the wrong key for a given identifier. Furthermore, each added key + * is charged to the appropriate user's quota for the keyrings service, which + * prevents a malicious user from adding too many keys. Finally, we forbid a + * user from removing a key while other users have added it too, which prevents + * a user who knows another user's key from causing a denial-of-service by + * removing it at an inopportune time. (We tolerate that a user who knows a key + * can prevent other users from removing it.) + * + * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of + * Documentation/filesystems/fscrypt.rst. + */ +int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct fscrypt_add_key_arg __user *uarg = _uarg; + struct fscrypt_add_key_arg arg; + struct fscrypt_master_key_secret secret; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || + arg.raw_size > FSCRYPT_MAX_KEY_SIZE) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + memset(&secret, 0, sizeof(secret)); + secret.size = arg.raw_size; + err = -EFAULT; + if (copy_from_user(secret.raw, uarg->raw, secret.size)) + goto out_wipe_secret; + + switch (arg.key_spec.type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + /* + * Only root can add keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash --- since + * otherwise a malicious user could add the wrong key. + */ + err = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out_wipe_secret; + break; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out_wipe_secret; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret.raw, secret.size); + + /* Calculate the key identifier and return it to userspace. */ + err = fscrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + goto out_wipe_secret; + err = -EFAULT; + if (copy_to_user(uarg->key_spec.u.identifier, + arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out_wipe_secret; + } + + err = add_master_key(sb, &secret, &arg.key_spec); +out_wipe_secret: + wipe_master_key_secret(&secret); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); + +/* + * Verify that the current user has added a master key with the given identifier + * (returns -ENOKEY if not). This is needed to prevent a user from encrypting + * their files using some other user's key which they don't actually know. + * Cryptographically this isn't much of a problem, but the semantics of this + * would be a bit weird, so it's best to just forbid it. + * + * The system administrator (CAP_FOWNER) can override this, which should be + * enough for any use cases where encryption policies are being set using keys + * that were chosen ahead of time but aren't available at the moment. + * + * Note that the key may have already removed by the time this returns, but + * that's okay; we just care whether the key was there at some point. + * + * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code + */ +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ + struct fscrypt_key_specifier mk_spec; + struct key *key, *mk_user; + struct fscrypt_master_key *mk; + int err; + + mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); + + key = fscrypt_find_master_key(sb, &mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out; + } + mk = key->payload.data[0]; + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) { + err = PTR_ERR(mk_user); + } else { + key_put(mk_user); + err = 0; + } + key_put(key); +out: + if (err == -ENOKEY && capable(CAP_FOWNER)) + err = 0; + return err; +} + +/* + * Try to evict the inode's dentries from the dentry cache. If the inode is a + * directory, then it can have at most one dentry; however, that dentry may be + * pinned by child dentries, so first try to evict the children too. + */ +static void shrink_dcache_inode(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + dentry = d_find_any_alias(inode); + if (dentry) { + shrink_dcache_parent(dentry); + dput(dentry); + } + } + d_prune_aliases(inode); +} + +static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) +{ + struct fscrypt_info *ci; + struct inode *inode; + struct inode *toput_inode = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { + inode = ci->ci_inode; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&mk->mk_decrypted_inodes_lock); + + shrink_dcache_inode(inode); + iput(toput_inode); + toput_inode = inode; + + spin_lock(&mk->mk_decrypted_inodes_lock); + } + + spin_unlock(&mk->mk_decrypted_inodes_lock); + iput(toput_inode); +} + +static int check_for_busy_inodes(struct super_block *sb, + struct fscrypt_master_key *mk) +{ + struct list_head *pos; + size_t busy_count = 0; + unsigned long ino; + struct dentry *dentry; + char _path[256]; + char *path = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each(pos, &mk->mk_decrypted_inodes) + busy_count++; + + if (busy_count == 0) { + spin_unlock(&mk->mk_decrypted_inodes_lock); + return 0; + } + + { + /* select an example file to show for debugging purposes */ + struct inode *inode = + list_first_entry(&mk->mk_decrypted_inodes, + struct fscrypt_info, + ci_master_key_link)->ci_inode; + ino = inode->i_ino; + dentry = d_find_alias(inode); + } + spin_unlock(&mk->mk_decrypted_inodes_lock); + + if (dentry) { + path = dentry_path(dentry, _path, sizeof(_path)); + dput(dentry); + } + if (IS_ERR_OR_NULL(path)) + path = "(unknown)"; + + fscrypt_warn(NULL, + "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)", + sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, + ino, path); + return -EBUSY; +} + +static int try_to_lock_encrypted_files(struct super_block *sb, + struct fscrypt_master_key *mk) +{ + int err1; + int err2; + + /* + * An inode can't be evicted while it is dirty or has dirty pages. + * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. + * + * Just do it the easy way: call sync_filesystem(). It's overkill, but + * it works, and it's more important to minimize the amount of caches we + * drop than the amount of data we sync. Also, unprivileged users can + * already call sync_filesystem() via sys_syncfs() or sys_sync(). + */ + down_read(&sb->s_umount); + err1 = sync_filesystem(sb); + up_read(&sb->s_umount); + /* If a sync error occurs, still try to evict as much as possible. */ + + /* + * Inodes are pinned by their dentries, so we have to evict their + * dentries. shrink_dcache_sb() would suffice, but would be overkill + * and inappropriate for use by unprivileged users. So instead go + * through the inodes' alias lists and try to evict each dentry. + */ + evict_dentries_for_decrypted_inodes(mk); + + /* + * evict_dentries_for_decrypted_inodes() already iput() each inode in + * the list; any inodes for which that dropped the last reference will + * have been evicted due to fscrypt_drop_inode() detecting the key + * removal and telling the VFS to evict the inode. So to finish, we + * just need to check whether any inodes couldn't be evicted. + */ + err2 = check_for_busy_inodes(sb, mk); + + return err1 ?: err2; +} + +/* + * Try to remove an fscrypt master encryption key. + * + * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's + * claim to the key, then removes the key itself if no other users have claims. + * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the + * key itself. + * + * To "remove the key itself", first we wipe the actual master key secret, so + * that no more inodes can be unlocked with it. Then we try to evict all cached + * inodes that had been unlocked with the key. + * + * If all inodes were evicted, then we unlink the fscrypt_master_key from the + * keyring. Otherwise it remains in the keyring in the "incompletely removed" + * state (without the actual secret key) where it tracks the list of remaining + * inodes. Userspace can execute the ioctl again later to retry eviction, or + * alternatively can re-add the secret key again. + * + * For more details, see the "Removing keys" section of + * Documentation/filesystems/fscrypt.rst. + */ +static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct fscrypt_remove_key_arg __user *uarg = _uarg; + struct fscrypt_remove_key_arg arg; + struct key *key; + struct fscrypt_master_key *mk; + u32 status_flags = 0; + int err; + bool dead; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + /* + * Only root can add and remove keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash. + */ + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* Find the key being removed. */ + key = fscrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) + return PTR_ERR(key); + mk = key->payload.data[0]; + + down_write(&key->sem); + + /* If relevant, remove current user's (or all users) claim to the key */ + if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { + if (all_users) + err = keyring_clear(mk->mk_users); + else + err = remove_master_key_user(mk); + if (err) { + up_write(&key->sem); + goto out_put_key; + } + if (mk->mk_users->keys.nr_leaves_on_tree != 0) { + /* + * Other users have still added the key too. We removed + * the current user's claim to the key, but we still + * can't remove the key itself. + */ + status_flags |= + FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; + err = 0; + up_write(&key->sem); + goto out_put_key; + } + } + + /* No user claims remaining. Go ahead and wipe the secret. */ + dead = false; + if (is_master_key_secret_present(&mk->mk_secret)) { + down_write(&mk->mk_secret_sem); + wipe_master_key_secret(&mk->mk_secret); + dead = refcount_dec_and_test(&mk->mk_refcount); + up_write(&mk->mk_secret_sem); + } + up_write(&key->sem); + if (dead) { + /* + * No inodes reference the key, and we wiped the secret, so the + * key object is free to be removed from the keyring. + */ + key_invalidate(key); + err = 0; + } else { + /* Some inodes still reference this key; try to evict them. */ + err = try_to_lock_encrypted_files(sb, mk); + if (err == -EBUSY) { + status_flags |= + FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; + err = 0; + } + } + /* + * We return 0 if we successfully did something: removed a claim to the + * key, wiped the secret, or tried locking the files again. Users need + * to check the informational status flags if they care whether the key + * has been fully removed including all files locked. + */ +out_put_key: + key_put(key); + if (err == 0) + err = put_user(status_flags, &uarg->removal_status_flags); + return err; +} + +int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg) +{ + return do_remove_key(filp, uarg, false); +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key); + +int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + return do_remove_key(filp, uarg, true); +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); + +/* + * Retrieve the status of an fscrypt master encryption key. + * + * We set ->status to indicate whether the key is absent, present, or + * incompletely removed. "Incompletely removed" means that the master key + * secret has been removed, but some files which had been unlocked with it are + * still in use. This field allows applications to easily determine the state + * of an encrypted directory without using a hack such as trying to open a + * regular file in it (which can confuse the "incompletely removed" state with + * absent or present). + * + * In addition, for v2 policy keys we allow applications to determine, via + * ->status_flags and ->user_count, whether the key has been added by the + * current user, by other users, or by both. Most applications should not need + * this, since ordinarily only one user should know a given key. However, if a + * secret key is shared by multiple users, applications may wish to add an + * already-present key to prevent other users from removing it. This ioctl can + * be used to check whether that really is the case before the work is done to + * add the key --- which might e.g. require prompting the user for a passphrase. + * + * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of + * Documentation/filesystems/fscrypt.rst. + */ +int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct fscrypt_get_key_status_arg arg; + struct key *key; + struct fscrypt_master_key *mk; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + arg.status_flags = 0; + arg.user_count = 0; + memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); + + key = fscrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY)) + return PTR_ERR(key); + arg.status = FSCRYPT_KEY_STATUS_ABSENT; + err = 0; + goto out; + } + mk = key->payload.data[0]; + down_read(&key->sem); + + if (!is_master_key_secret_present(&mk->mk_secret)) { + arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + err = 0; + goto out_release_key; + } + + arg.status = FSCRYPT_KEY_STATUS_PRESENT; + if (mk->mk_users) { + struct key *mk_user; + + arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; + mk_user = find_master_key_user(mk); + if (!IS_ERR(mk_user)) { + arg.status_flags |= + FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; + key_put(mk_user); + } else if (mk_user != ERR_PTR(-ENOKEY)) { + err = PTR_ERR(mk_user); + goto out_release_key; + } + } + err = 0; +out_release_key: + up_read(&key->sem); + key_put(key); +out: + if (!err && copy_to_user(uarg, &arg, sizeof(arg))) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status); + +int __init fscrypt_init_keyring(void) +{ + int err; + + err = register_key_type(&key_type_fscrypt); + if (err) + return err; + + err = register_key_type(&key_type_fscrypt_user); + if (err) + goto err_unregister_fscrypt; + + return 0; + +err_unregister_fscrypt: + unregister_key_type(&key_type_fscrypt); + return err; +} diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c new file mode 100644 index 000000000000..d71c2d6dd162 --- /dev/null +++ b/fs/crypto/keysetup.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. + */ + +#include <crypto/aes.h> +#include <crypto/sha.h> +#include <crypto/skcipher.h> +#include <linux/key.h> + +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; + +static struct fscrypt_mode available_modes[] = { + [FSCRYPT_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + .ivsize = 16, + }, + [FSCRYPT_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + .ivsize = 16, + }, + [FSCRYPT_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + .ivsize = 16, + .needs_essiv = true, + }, + [FSCRYPT_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 16, + .ivsize = 16, + }, + [FSCRYPT_MODE_ADIANTUM] = { + .friendly_name = "Adiantum", + .cipher_str = "adiantum(xchacha12,aes)", + .keysize = 32, + .ivsize = 32, + }, +}; + +static struct fscrypt_mode * +select_encryption_mode(const union fscrypt_policy *policy, + const struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return &available_modes[fscrypt_policy_contents_mode(policy)]; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[fscrypt_policy_fnames_mode(policy)]; + + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); +} + +/* Create a symmetric cipher object for the given encryption mode and key */ +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode) +{ + struct crypto_skcipher *tfm; + int err; + + tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + fscrypt_warn(inode, + "Missing crypto API support for %s (API name: \"%s\")", + mode->friendly_name, mode->cipher_str); + return ERR_PTR(-ENOPKG); + } + fscrypt_err(inode, "Error allocating '%s' transform: %ld", + mode->cipher_str, PTR_ERR(tfm)); + return tfm; + } + if (unlikely(!mode->logged_impl_name)) { + /* + * fscrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. + */ + mode->logged_impl_name = true; + pr_info("fscrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(tfm)->base.cra_driver_name); + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); + if (err) + goto err_free_tfm; + + return tfm; + +err_free_tfm: + crypto_free_skcipher(tfm); + return ERR_PTR(err); +} + +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + fscrypt_warn(NULL, + "Missing crypto API support for SHA-256"); + return -ENOPKG; + } + fscrypt_err(NULL, + "Error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE)) + return -EINVAL; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. + */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +/* Given the per-file key, set up the file's crypto transform object(s) */ +int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) +{ + struct fscrypt_mode *mode = ci->ci_mode; + struct crypto_skcipher *ctfm; + int err; + + ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode); + if (IS_ERR(ctfm)) + return PTR_ERR(ctfm); + + ci->ci_ctfm = ctfm; + + if (mode->needs_essiv) { + err = init_essiv_generator(ci, derived_key, mode->keysize); + if (err) { + fscrypt_warn(ci->ci_inode, + "Error initializing ESSIV generator: %d", + err); + return err; + } + } + return 0; +} + +static int setup_per_mode_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + struct fscrypt_mode *mode = ci->ci_mode; + u8 mode_num = mode - available_modes; + struct crypto_skcipher *tfm, *prev_tfm; + u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; + int err; + + if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys))) + return -EINVAL; + + /* pairs with cmpxchg() below */ + tfm = READ_ONCE(mk->mk_mode_keys[mode_num]); + if (likely(tfm != NULL)) + goto done; + + BUILD_BUG_ON(sizeof(mode_num) != 1); + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_MODE_KEY, + &mode_num, sizeof(mode_num), + mode_key, mode->keysize); + if (err) + return err; + tfm = fscrypt_allocate_skcipher(mode, mode_key, ci->ci_inode); + memzero_explicit(mode_key, mode->keysize); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + /* pairs with READ_ONCE() above */ + prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm); + if (prev_tfm != NULL) { + crypto_free_skcipher(tfm); + tfm = prev_tfm; + } +done: + ci->ci_ctfm = tfm; + return 0; +} + +static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; + int err; + + if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { + /* + * DIRECT_KEY: instead of deriving per-file keys, the per-file + * nonce will be included in all the IVs. But unlike v1 + * policies, for v2 policies in this case we don't encrypt with + * the master key directly but rather derive a per-mode key. + * This ensures that the master key is consistently used only + * for HKDF, avoiding key reuse issues. + */ + if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) { + fscrypt_warn(ci->ci_inode, + "Direct key flag not allowed with %s", + ci->ci_mode->friendly_name); + return -EINVAL; + } + return setup_per_mode_key(ci, mk); + } + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_KEY, + ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); + if (err) + return err; + + err = fscrypt_set_derived_key(ci, derived_key); + memzero_explicit(derived_key, ci->ci_mode->keysize); + return err; +} + +/* + * Find the master key, then set up the inode's actual encryption key. + * + * If the master key is found in the filesystem-level keyring, then the + * corresponding 'struct key' is returned in *master_key_ret with + * ->mk_secret_sem read-locked. This is needed to ensure that only one task + * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race + * to create an fscrypt_info for the same inode), and to synchronize the master + * key being removed with a new inode starting to use it. + */ +static int setup_file_encryption_key(struct fscrypt_info *ci, + struct key **master_key_ret) +{ + struct key *key; + struct fscrypt_master_key *mk = NULL; + struct fscrypt_key_specifier mk_spec; + int err; + + switch (ci->ci_policy.version) { + case FSCRYPT_POLICY_V1: + mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(mk_spec.u.descriptor, + ci->ci_policy.v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_POLICY_V2: + mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, + ci->ci_policy.v2.master_key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + return -EINVAL; + } + + key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY) || + ci->ci_policy.version != FSCRYPT_POLICY_V1) + return PTR_ERR(key); + + /* + * As a legacy fallback for v1 policies, search for the key in + * the current task's subscribed keyrings too. Don't move this + * to before the search of ->s_master_keys, since users + * shouldn't be able to override filesystem-level keys. + */ + return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); + } + + mk = key->payload.data[0]; + down_read(&mk->mk_secret_sem); + + /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ + if (!is_master_key_secret_present(&mk->mk_secret)) { + err = -ENOKEY; + goto out_release_key; + } + + /* + * Require that the master key be at least as long as the derived key. + * Otherwise, the derived key cannot possibly contain as much entropy as + * that required by the encryption mode it will be used for. For v1 + * policies it's also required for the KDF to work at all. + */ + if (mk->mk_secret.size < ci->ci_mode->keysize) { + fscrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk_spec), + master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, + mk->mk_secret.size, ci->ci_mode->keysize); + err = -ENOKEY; + goto out_release_key; + } + + switch (ci->ci_policy.version) { + case FSCRYPT_POLICY_V1: + err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); + break; + case FSCRYPT_POLICY_V2: + err = fscrypt_setup_v2_file_key(ci, mk); + break; + default: + WARN_ON(1); + err = -EINVAL; + break; + } + if (err) + goto out_release_key; + + *master_key_ret = key; + return 0; + +out_release_key: + up_read(&mk->mk_secret_sem); + key_put(key); + return err; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + struct key *key; + + if (!ci) + return; + + if (ci->ci_direct_key) { + fscrypt_put_direct_key(ci->ci_direct_key); + } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) && + !fscrypt_is_direct_key_policy(&ci->ci_policy)) { + crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); + } + + key = ci->ci_master_key; + if (key) { + struct fscrypt_master_key *mk = key->payload.data[0]; + + /* + * Remove this inode from the list of inodes that were unlocked + * with the master key. + * + * In addition, if we're removing the last inode from a key that + * already had its secret removed, invalidate the key so that it + * gets removed from ->s_master_keys. + */ + spin_lock(&mk->mk_decrypted_inodes_lock); + list_del(&ci->ci_master_key_link); + spin_unlock(&mk->mk_decrypted_inodes_lock); + if (refcount_dec_and_test(&mk->mk_refcount)) + key_invalidate(key); + key_put(key); + } + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + union fscrypt_context ctx; + struct fscrypt_mode *mode; + struct key *master_key = NULL; + int res; + + if (fscrypt_has_encryption_key(inode)) + return 0; + + res = fscrypt_initialize(inode->i_sb->s_cop->flags); + if (res) + return res; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode) || + IS_ENCRYPTED(inode)) { + fscrypt_warn(inode, + "Error %d getting encryption context", + res); + return res; + } + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); + ctx.version = FSCRYPT_CONTEXT_V1; + ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memset(ctx.v1.master_key_descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + res = sizeof(ctx.v1); + } + + crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_inode = inode; + + res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); + if (res) { + fscrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + goto out; + } + + switch (ctx.version) { + case FSCRYPT_CONTEXT_V1: + memcpy(crypt_info->ci_nonce, ctx.v1.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + memcpy(crypt_info->ci_nonce, ctx.v2.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + default: + WARN_ON(1); + res = -EINVAL; + goto out; + } + + if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { + res = -EINVAL; + goto out; + } + + mode = select_encryption_mode(&crypt_info->ci_policy, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); + goto out; + } + WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); + crypt_info->ci_mode = mode; + + res = setup_file_encryption_key(crypt_info, &master_key); + if (res) + goto out; + + if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (master_key) { + struct fscrypt_master_key *mk = + master_key->payload.data[0]; + + refcount_inc(&mk->mk_refcount); + crypt_info->ci_master_key = key_get(master_key); + spin_lock(&mk->mk_decrypted_inodes_lock); + list_add(&crypt_info->ci_master_key_link, + &mk->mk_decrypted_inodes); + spin_unlock(&mk->mk_decrypted_inodes_lock); + } + crypt_info = NULL; + } + res = 0; +out: + if (master_key) { + struct fscrypt_master_key *mk = master_key->payload.data[0]; + + up_read(&mk->mk_secret_sem); + key_put(master_key); + } + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + return res; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); + +/** + * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * + * Free the inode's fscrypt_info. Filesystems must call this when the inode is + * being evicted. An RCU grace period need not have elapsed yet. + */ +void fscrypt_put_encryption_info(struct inode *inode) +{ + put_crypt_info(inode->i_crypt_info); + inode->i_crypt_info = NULL; +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +/** + * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * + * Free the inode's cached decrypted symlink target, if any. Filesystems must + * call this after an RCU grace period, just before they free the inode. + */ +void fscrypt_free_inode(struct inode *inode) +{ + if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { + kfree(inode->i_link); + inode->i_link = NULL; + } +} +EXPORT_SYMBOL(fscrypt_free_inode); + +/** + * fscrypt_drop_inode - check whether the inode's master key has been removed + * + * Filesystems supporting fscrypt must call this from their ->drop_inode() + * method so that encrypted inodes are evicted as soon as they're no longer in + * use and their master key has been removed. + * + * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 + */ +int fscrypt_drop_inode(struct inode *inode) +{ + const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info); + const struct fscrypt_master_key *mk; + + /* + * If ci is NULL, then the inode doesn't have an encryption key set up + * so it's irrelevant. If ci_master_key is NULL, then the master key + * was provided via the legacy mechanism of the process-subscribed + * keyrings, so we don't know whether it's been removed or not. + */ + if (!ci || !ci->ci_master_key) + return 0; + mk = ci->ci_master_key->payload.data[0]; + + /* + * Note: since we aren't holding ->mk_secret_sem, the result here can + * immediately become outdated. But there's no correctness problem with + * unnecessarily evicting. Nor is there a correctness problem with not + * evicting while iput() is racing with the key being removed, since + * then the thread removing the key will either evict the inode itself + * or will correctly detect that it wasn't evicted due to the race. + */ + return !is_master_key_secret_present(&mk->mk_secret); +} +EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c new file mode 100644 index 000000000000..ad1a36c370c3 --- /dev/null +++ b/fs/crypto/keysetup_v1.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup for v1 encryption policies + * + * Copyright 2015, 2019 Google LLC + */ + +/* + * This file implements compatibility functions for the original encryption + * policy version ("v1"), including: + * + * - Deriving per-file keys using the AES-128-ECB based KDF + * (rather than the new method of using HKDF-SHA512) + * + * - Retrieving fscrypt master keys from process-subscribed keyrings + * (rather than the new method of using a filesystem-level keyring) + * + * - Handling policies with the DIRECT_KEY flag set using a master key table + * (rather than the new method of implementing DIRECT_KEY with per-mode keys + * managed alongside the master keys in the filesystem-level keyring) + */ + +#include <crypto/algapi.h> +#include <crypto/skcipher.h> +#include <keys/user-type.h> +#include <linux/hashtable.h> +#include <linux/scatterlist.h> + +#include "fscrypt_private.h" + +/* Table of keys referenced by DIRECT_KEY policies */ +static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */ +static DEFINE_SPINLOCK(fscrypt_direct_keys_lock); + +/* + * v1 key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the nonce as the AES key. This provides a + * unique derived key with sufficient entropy for each inode. However, it's + * nonstandard, non-extensible, doesn't evenly distribute the entropy from the + * master key, and is trivially reversible: an attacker who compromises a + * derived key can "decrypt" it to get back to the master key, then derive any + * other key. For all new code, use HKDF instead. + * + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. + */ +static int derive_key_aes(const u8 *master_key, + const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE], + u8 *derived_key, unsigned int derived_keysize) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, + NULL); + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. + */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct fscrypt_key **payload_ret) +{ + char *description; + struct key *key; + const struct user_key_payload *ukp; + const struct fscrypt_key *payload; + + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); + if (!description) + return ERR_PTR(-ENOMEM); + + key = request_key(&key_type_logon, description, NULL); + kfree(description); + if (IS_ERR(key)) + return key; + + down_read(&key->sem); + ukp = user_key_payload_locked(key); + + if (!ukp) /* was the key revoked before we acquired its semaphore? */ + goto invalid; + + payload = (const struct fscrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct fscrypt_key) || + payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) { + fscrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; + } + + if (payload->size < min_keysize) { + fscrypt_warn(NULL, + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; + } + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Master key referenced by DIRECT_KEY policy */ +struct fscrypt_direct_key { + struct hlist_node dk_node; + refcount_t dk_refcount; + const struct fscrypt_mode *dk_mode; + struct crypto_skcipher *dk_ctfm; + u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; + u8 dk_raw[FSCRYPT_MAX_KEY_SIZE]; +}; + +static void free_direct_key(struct fscrypt_direct_key *dk) +{ + if (dk) { + crypto_free_skcipher(dk->dk_ctfm); + kzfree(dk); + } +} + +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk) +{ + if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock)) + return; + hash_del(&dk->dk_node); + spin_unlock(&fscrypt_direct_keys_lock); + + free_direct_key(dk); +} + +/* + * Find/insert the given key into the fscrypt_direct_keys table. If found, it + * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. If + * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise + * NULL is returned. + */ +static struct fscrypt_direct_key * +find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, + const u8 *raw_key, const struct fscrypt_info *ci) +{ + unsigned long hash_key; + struct fscrypt_direct_key *dk; + + /* + * Careful: to avoid potentially leaking secret key bytes via timing + * information, we must key the hash table by descriptor rather than by + * raw key, and use crypto_memneq() when comparing raw keys. + */ + + BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor, + sizeof(hash_key)); + + spin_lock(&fscrypt_direct_keys_lock); + hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) { + if (memcmp(ci->ci_policy.v1.master_key_descriptor, + dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0) + continue; + if (ci->ci_mode != dk->dk_mode) + continue; + if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) + continue; + /* using existing tfm with same (descriptor, mode, raw_key) */ + refcount_inc(&dk->dk_refcount); + spin_unlock(&fscrypt_direct_keys_lock); + free_direct_key(to_insert); + return dk; + } + if (to_insert) + hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key); + spin_unlock(&fscrypt_direct_keys_lock); + return to_insert; +} + +/* Prepare to encrypt directly using the master key in the given mode */ +static struct fscrypt_direct_key * +fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) +{ + struct fscrypt_direct_key *dk; + int err; + + /* Is there already a tfm for this key? */ + dk = find_or_insert_direct_key(NULL, raw_key, ci); + if (dk) + return dk; + + /* Nope, allocate one. */ + dk = kzalloc(sizeof(*dk), GFP_NOFS); + if (!dk) + return ERR_PTR(-ENOMEM); + refcount_set(&dk->dk_refcount, 1); + dk->dk_mode = ci->ci_mode; + dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, + ci->ci_inode); + if (IS_ERR(dk->dk_ctfm)) { + err = PTR_ERR(dk->dk_ctfm); + dk->dk_ctfm = NULL; + goto err_free_dk; + } + memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); + + return find_or_insert_direct_key(dk, raw_key, ci); + +err_free_dk: + free_direct_key(dk); + return ERR_PTR(err); +} + +/* v1 policy, DIRECT_KEY: use the master key directly */ +static int setup_v1_file_key_direct(struct fscrypt_info *ci, + const u8 *raw_master_key) +{ + const struct fscrypt_mode *mode = ci->ci_mode; + struct fscrypt_direct_key *dk; + + if (!fscrypt_mode_supports_direct_key(mode)) { + fscrypt_warn(ci->ci_inode, + "Direct key mode not allowed with %s", + mode->friendly_name); + return -EINVAL; + } + + if (ci->ci_policy.v1.contents_encryption_mode != + ci->ci_policy.v1.filenames_encryption_mode) { + fscrypt_warn(ci->ci_inode, + "Direct key mode not allowed with different contents and filenames modes"); + return -EINVAL; + } + + /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ + if (WARN_ON(mode->needs_essiv)) + return -EINVAL; + + dk = fscrypt_get_direct_key(ci, raw_master_key); + if (IS_ERR(dk)) + return PTR_ERR(dk); + ci->ci_direct_key = dk; + ci->ci_ctfm = dk->dk_ctfm; + return 0; +} + +/* v1 policy, !DIRECT_KEY: derive the file's encryption key */ +static int setup_v1_file_key_derived(struct fscrypt_info *ci, + const u8 *raw_master_key) +{ + u8 *derived_key; + int err; + + /* + * This cannot be a stack buffer because it will be passed to the + * scatterlist crypto API during derive_key_aes(). + */ + derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + if (!derived_key) + return -ENOMEM; + + err = derive_key_aes(raw_master_key, ci->ci_nonce, + derived_key, ci->ci_mode->keysize); + if (err) + goto out; + + err = fscrypt_set_derived_key(ci, derived_key); +out: + kzfree(derived_key); + return err; +} + +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key) +{ + if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) + return setup_v1_file_key_direct(ci, raw_master_key); + else + return setup_v1_file_key_derived(ci, raw_master_key); +} + +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci) +{ + struct key *key; + const struct fscrypt_key *payload; + int err; + + key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, &payload); + if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) { + key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, &payload); + } + if (IS_ERR(key)) + return PTR_ERR(key); + + err = fscrypt_setup_v1_file_key(ci, payload->raw); + up_read(&key->sem); + key_put(key); + return err; +} diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4941fe8471ce..4072ba644595 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -5,8 +5,9 @@ * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility. * - * Written by Michael Halcrow, 2015. + * Originally written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. + * Modified by Eric Biggers, 2019 for v2 policy support. */ #include <linux/random.h> @@ -14,70 +15,303 @@ #include <linux/mount.h> #include "fscrypt_private.h" -/* - * check whether an encryption policy is consistent with an encryption context +/** + * fscrypt_policies_equal - check whether two encryption policies are the same + * + * Return: %true if equal, else %false + */ +bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2) +{ + if (policy1->version != policy2->version) + return false; + + return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); +} + +/** + * fscrypt_supported_policy - check whether an encryption policy is supported + * + * Given an encryption policy, check whether all its encryption modes and other + * settings are supported by this kernel. (But we don't currently don't check + * for crypto API support here, so attempting to use an algorithm not configured + * into the crypto API will still fail later.) + * + * Return: %true if supported, else %false + */ +bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode) +{ + switch (policy_u->version) { + case FSCRYPT_POLICY_V1: { + const struct fscrypt_policy_v1 *policy = &policy_u->v1; + + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + fscrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + fscrypt_warn(inode, + "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + return true; + } + case FSCRYPT_POLICY_V2: { + const struct fscrypt_policy_v2 *policy = &policy_u->v2; + + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + fscrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + fscrypt_warn(inode, + "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + if (memchr_inv(policy->__reserved, 0, + sizeof(policy->__reserved))) { + fscrypt_warn(inode, + "Reserved bits set in encryption policy"); + return false; + } + + return true; + } + } + return false; +} + +/** + * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy + * + * Create an fscrypt_context for an inode that is being assigned the given + * encryption policy. A new nonce is randomly generated. + * + * Return: the size of the new context in bytes. */ -static bool is_encryption_context_consistent_with_policy( - const struct fscrypt_context *ctx, - const struct fscrypt_policy *policy) +static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, + const union fscrypt_policy *policy_u) { - return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx->flags == policy->flags) && - (ctx->contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx->filenames_encryption_mode == - policy->filenames_encryption_mode); + memset(ctx_u, 0, sizeof(*ctx_u)); + + switch (policy_u->version) { + case FSCRYPT_POLICY_V1: { + const struct fscrypt_policy_v1 *policy = &policy_u->v1; + struct fscrypt_context_v1 *ctx = &ctx_u->v1; + + ctx->version = FSCRYPT_CONTEXT_V1; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_descriptor, + policy->master_key_descriptor, + sizeof(ctx->master_key_descriptor)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + case FSCRYPT_POLICY_V2: { + const struct fscrypt_policy_v2 *policy = &policy_u->v2; + struct fscrypt_context_v2 *ctx = &ctx_u->v2; + + ctx->version = FSCRYPT_CONTEXT_V2; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_identifier, + policy->master_key_identifier, + sizeof(ctx->master_key_identifier)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + } + BUG(); } -static int create_encryption_context_from_policy(struct inode *inode, - const struct fscrypt_policy *policy) +/** + * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * + * Given an fscrypt_context, build the corresponding fscrypt_policy. + * + * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized + * version number or size. + * + * This does *not* validate the settings within the policy itself, e.g. the + * modes, flags, and reserved bits. Use fscrypt_supported_policy() for that. + */ +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size) { - struct fscrypt_context ctx; + memset(policy_u, 0, sizeof(*policy_u)); + + if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u)) + return -EINVAL; + + switch (ctx_u->version) { + case FSCRYPT_CONTEXT_V1: { + const struct fscrypt_context_v1 *ctx = &ctx_u->v1; + struct fscrypt_policy_v1 *policy = &policy_u->v1; + + policy->version = FSCRYPT_POLICY_V1; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->master_key_descriptor, + ctx->master_key_descriptor, + sizeof(policy->master_key_descriptor)); + return 0; + } + case FSCRYPT_CONTEXT_V2: { + const struct fscrypt_context_v2 *ctx = &ctx_u->v2; + struct fscrypt_policy_v2 *policy = &policy_u->v2; + + policy->version = FSCRYPT_POLICY_V2; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->__reserved, ctx->__reserved, + sizeof(policy->__reserved)); + memcpy(policy->master_key_identifier, + ctx->master_key_identifier, + sizeof(policy->master_key_identifier)); + return 0; + } + } + /* unreachable */ + return -EINVAL; +} + +/* Retrieve an inode's encryption policy */ +static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) +{ + const struct fscrypt_info *ci; + union fscrypt_context ctx; + int ret; + + ci = READ_ONCE(inode->i_crypt_info); + if (ci) { + /* key available, use the cached policy */ + *policy = ci->ci_policy; + return 0; + } + + if (!IS_ENCRYPTED(inode)) + return -ENODATA; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret < 0) + return (ret == -ERANGE) ? -EINVAL : ret; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, - policy->filenames_encryption_mode)) + return fscrypt_policy_from_context(policy, &ctx, ret); +} + +static int set_encryption_policy(struct inode *inode, + const union fscrypt_policy *policy) +{ + union fscrypt_context ctx; + int ctxsize; + int err; + + if (!fscrypt_supported_policy(policy, inode)) return -EINVAL; - if (policy->flags & ~FS_POLICY_FLAGS_VALID) + switch (policy->version) { + case FSCRYPT_POLICY_V1: + /* + * The original encryption policy version provided no way of + * verifying that the correct master key was supplied, which was + * insecure in scenarios where multiple users have access to the + * same encrypted files (even just read-only access). The new + * encryption policy version fixes this and also implies use of + * an improved key derivation function and allows non-root users + * to securely remove keys. So as long as compatibility with + * old kernels isn't required, it is recommended to use the new + * policy version for all new encrypted directories. + */ + pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n", + current->comm, current->pid); + break; + case FSCRYPT_POLICY_V2: + err = fscrypt_verify_key_added(inode->i_sb, + policy->v2.master_key_identifier); + if (err) + return err; + break; + default: + WARN_ON(1); return -EINVAL; + } - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + ctxsize = fscrypt_new_context_from_policy(&ctx, policy); - return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); + return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { - struct fscrypt_policy policy; + union fscrypt_policy policy; + union fscrypt_policy existing_policy; struct inode *inode = file_inode(filp); + u8 version; + int size; int ret; - struct fscrypt_context ctx; - if (copy_from_user(&policy, arg, sizeof(policy))) + if (get_user(policy.version, (const u8 __user *)arg)) return -EFAULT; + size = fscrypt_policy_size(&policy); + if (size <= 0) + return -EINVAL; + + /* + * We should just copy the remaining 'size - 1' bytes here, but a + * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to + * think that size can be 0 here (despite the check above!) *and* that + * it's a compile-time constant. Thus it would think copy_from_user() + * is passed compile-time constant ULONG_MAX, causing the compile-time + * buffer overflow check to fail, breaking the build. This only occurred + * when building an i386 kernel with -Os and branch profiling enabled. + * + * Work around it by just copying the first byte again... + */ + version = policy.version; + if (copy_from_user(&policy, arg, size)) + return -EFAULT; + policy.version = version; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy.version != 0) - return -EINVAL; - ret = mnt_want_write_file(filp); if (ret) return ret; inode_lock(inode); - ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + ret = fscrypt_get_policy(inode, &existing_policy); if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; @@ -86,14 +320,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else - ret = create_encryption_context_from_policy(inode, - &policy); - } else if (ret == sizeof(ctx) && - is_encryption_context_consistent_with_policy(&ctx, - &policy)) { - /* The file already uses the same encryption policy. */ - ret = 0; - } else if (ret >= 0 || ret == -ERANGE) { + ret = set_encryption_policy(inode, &policy); + } else if (ret == -EINVAL || + (ret == 0 && !fscrypt_policies_equal(&policy, + &existing_policy))) { /* The file already uses a different encryption policy. */ ret = -EEXIST; } @@ -105,37 +335,57 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_set_policy); +/* Original ioctl version; can only get the original policy version */ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { - struct inode *inode = file_inode(filp); - struct fscrypt_context ctx; - struct fscrypt_policy policy; - int res; + union fscrypt_policy policy; + int err; - if (!IS_ENCRYPTED(inode)) - return -ENODATA; + err = fscrypt_get_policy(file_inode(filp), &policy); + if (err) + return err; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0 && res != -ERANGE) - return res; - if (res != sizeof(ctx)) - return -EINVAL; - if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + if (policy.version != FSCRYPT_POLICY_V1) return -EINVAL; - policy.version = 0; - policy.contents_encryption_mode = ctx.contents_encryption_mode; - policy.filenames_encryption_mode = ctx.filenames_encryption_mode; - policy.flags = ctx.flags; - memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - - if (copy_to_user(arg, &policy, sizeof(policy))) + if (copy_to_user(arg, &policy, sizeof(policy.v1))) return -EFAULT; return 0; } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/* Extended ioctl version; can get policies of any version */ +int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) +{ + struct fscrypt_get_policy_ex_arg arg; + union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy; + size_t policy_size; + int err; + + /* arg is policy_size, then policy */ + BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0); + BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) != + offsetof(typeof(arg), policy)); + BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy)); + + err = fscrypt_get_policy(file_inode(filp), policy); + if (err) + return err; + policy_size = fscrypt_policy_size(policy); + + if (copy_from_user(&arg, uarg, sizeof(arg.policy_size))) + return -EFAULT; + + if (policy_size > arg.policy_size) + return -EOVERFLOW; + arg.policy_size = policy_size; + + if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex); + /** * fscrypt_has_permitted_context() - is a file's encryption policy permitted * within its directory? @@ -157,10 +407,8 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy); */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - const struct fscrypt_operations *cops = parent->i_sb->s_cop; - const struct fscrypt_info *parent_ci, *child_ci; - struct fscrypt_context parent_ctx, child_ctx; - int res; + union fscrypt_policy parent_policy, child_policy; + int err; /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && @@ -190,41 +438,22 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) * In any case, if an unexpected error occurs, fall back to "forbidden". */ - res = fscrypt_get_encryption_info(parent); - if (res) + err = fscrypt_get_encryption_info(parent); + if (err) return 0; - res = fscrypt_get_encryption_info(child); - if (res) + err = fscrypt_get_encryption_info(child); + if (err) return 0; - parent_ci = READ_ONCE(parent->i_crypt_info); - child_ci = READ_ONCE(child->i_crypt_info); - - if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key_descriptor, - child_ci->ci_master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == - child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags); - } - res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); - if (res != sizeof(parent_ctx)) + err = fscrypt_get_policy(parent, &parent_policy); + if (err) return 0; - res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); - if (res != sizeof(child_ctx)) + err = fscrypt_get_policy(child, &child_policy); + if (err) return 0; - return memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode) && - (parent_ctx.flags == child_ctx.flags); + return fscrypt_policies_equal(&parent_policy, &child_policy); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -240,7 +469,8 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) { - struct fscrypt_context ctx; + union fscrypt_context ctx; + int ctxsize; struct fscrypt_info *ci; int res; @@ -252,16 +482,10 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, if (ci == NULL) return -ENOKEY; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE); - get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy); + BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - res = parent->i_sb->s_cop->set_context(child, &ctx, - sizeof(ctx), fs_data); + res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data); if (res) return res; return preload ? fscrypt_get_encryption_info(child): 0; diff --git a/fs/d_path.c b/fs/d_path.c index a7d0a96b35ce..0f1fc1743302 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -116,8 +116,10 @@ restart: vfsmnt = &mnt->mnt; continue; } - if (!error) - error = is_mounted(vfsmnt) ? 1 : 2; + if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns)) + error = 1; // absolute root + else + error = 2; // detached or not attached yet break; } parent = dentry->d_parent; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index beeadca23b05..42e5a766d33c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -622,7 +622,7 @@ void devpts_pty_kill(struct dentry *dentry) dentry->d_fsdata = NULL; drop_nlink(dentry->d_inode); fsnotify_unlink(d_inode(dentry->d_parent), dentry); - d_delete(dentry); + d_drop(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ } diff --git a/fs/efs/super.c b/fs/efs/super.c index 867fc24dee20..4a6ebff2af76 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -257,6 +257,8 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!sb) return -ENOMEM; s->s_fs_info = sb; + s->s_time_min = 0; + s->s_time_max = U32_MAX; s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig new file mode 100644 index 000000000000..9d634d3a1845 --- /dev/null +++ b/fs/erofs/Kconfig @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config EROFS_FS + tristate "EROFS filesystem support" + depends on BLOCK + help + EROFS (Enhanced Read-Only File System) is a lightweight + read-only file system with modern designs (eg. page-sized + blocks, inline xattrs/data, etc.) for scenarios which need + high-performance read-only requirements, e.g. Android OS + for mobile phones and LIVECDs. + + It also provides fixed-sized output compression support, + which improves storage density, keeps relatively higher + compression ratios, which is more useful to achieve high + performance for embedded devices with limited memory. + + If unsure, say N. + +config EROFS_FS_DEBUG + bool "EROFS debugging feature" + depends on EROFS_FS + help + Print debugging messages and enable more BUG_ONs which check + filesystem consistency and find potential issues aggressively, + which can be used for Android eng build, for example. + + For daily use, say N. + +config EROFS_FS_XATTR + bool "EROFS extended attributes" + depends on EROFS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config EROFS_FS_POSIX_ACL + bool "EROFS Access Control Lists" + depends on EROFS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + +config EROFS_FS_SECURITY + bool "EROFS Security Labels" + depends on EROFS_FS_XATTR + default y + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the erofs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config EROFS_FS_ZIP + bool "EROFS Data Compression Support" + depends on EROFS_FS + select LZ4_DECOMPRESS + default y + help + Enable fixed-sized output compression for EROFS. + + If you don't want to enable compression feature, say N. + +config EROFS_FS_CLUSTER_PAGE_LIMIT + int "EROFS Cluster Pages Hard Limit" + depends on EROFS_FS_ZIP + range 1 256 + default "1" + help + Indicates maximum # of pages of a compressed + physical cluster. + + For example, if files in a image were compressed + into 8k-unit, hard limit should not be configured + less than 2. Otherwise, the image will be refused + to mount on this kernel. + diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile new file mode 100644 index 000000000000..46f2aa4ba46c --- /dev/null +++ b/fs/erofs/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +EROFS_VERSION = "1.0" + +ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\" + +obj-$(CONFIG_EROFS_FS) += erofs.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o + diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h new file mode 100644 index 000000000000..07d279fd5d67 --- /dev/null +++ b/fs/erofs/compress.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_COMPRESS_H +#define __EROFS_FS_COMPRESS_H + +#include "internal.h" + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; + +struct z_erofs_decompress_req { + struct super_block *sb; + struct page **in, **out; + + unsigned short pageofs_out; + unsigned int inputsize, outputsize; + + /* indicate the algorithm will be used for decompression */ + unsigned int alg; + bool inplace_io, partial_decoding; +}; + +/* + * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - + * used to mark temporary allocated pages from other + * file/cached pages and NULL mapping pages. + */ +#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) + +/* check if a page is marked as staging */ +static inline bool z_erofs_page_is_staging(struct page *page) +{ + return page->mapping == Z_EROFS_MAPPING_STAGING; +} + +static inline bool z_erofs_put_stagingpage(struct list_head *pagepool, + struct page *page) +{ + if (!z_erofs_page_is_staging(page)) + return false; + + /* staging pages should not be used by others at the same time */ + if (page_ref_count(page) > 1) + put_page(page); + else + list_add(&page->lru, pagepool); + return true; +} + +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + +#endif + diff --git a/fs/erofs/data.c b/fs/erofs/data.c new file mode 100644 index 000000000000..8a9fcbd0e8ac --- /dev/null +++ b/fs/erofs/data.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +static void erofs_readendio(struct bio *bio) +{ + struct bio_vec *bvec; + blk_status_t err = bio->bi_status; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + /* page is already locked */ + DBG_BUGON(PageUptodate(page)); + + if (err) + SetPageError(page); + else + SetPageUptodate(page); + + unlock_page(page); + /* page could be reclaimed now */ + } + bio_put(bio); +} + +struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) +{ + struct inode *const bd_inode = sb->s_bdev->bd_inode; + struct address_space *const mapping = bd_inode->i_mapping; + + return read_cache_page_gfp(mapping, blkaddr, + mapping_gfp_constraint(mapping, ~__GFP_FS)); +} + +static int erofs_map_blocks_flatmode(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + int err = 0; + erofs_blk_t nblocks, lastblk; + u64 offset = map->m_la; + struct erofs_inode *vi = EROFS_I(inode); + bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + + trace_erofs_map_blocks_flatmode_enter(inode, map, flags); + + nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + lastblk = nblocks - tailendpacking; + + if (offset >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + /* there is no hole in flatmode */ + map->m_flags = EROFS_MAP_MAPPED; + + if (offset < blknr_to_addr(lastblk)) { + map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; + map->m_plen = blknr_to_addr(lastblk) - offset; + } else if (tailendpacking) { + /* 2 - inode inline B: inode, [xattrs], inline last blk... */ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_plen = inode->i_size - offset; + + /* inline data should be located in one meta block */ + if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto err_out; + } + + map->m_flags |= EROFS_MAP_META; + } else { + erofs_err(inode->i_sb, + "internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); + DBG_BUGON(1); + err = -EIO; + goto err_out; + } + +out: + map->m_llen = map->m_plen; + +err_out: + trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); + return err; +} + +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { + int err = z_erofs_map_blocks_iter(inode, map, flags); + + if (map->mpage) { + put_page(map->mpage); + map->mpage = NULL; + } + return err; + } + return erofs_map_blocks_flatmode(inode, map, flags); +} + +static inline struct bio *erofs_read_raw_page(struct bio *bio, + struct address_space *mapping, + struct page *page, + erofs_off_t *last_block, + unsigned int nblocks, + bool ra) +{ + struct inode *const inode = mapping->host; + struct super_block *const sb = inode->i_sb; + erofs_off_t current_block = (erofs_off_t)page->index; + int err; + + DBG_BUGON(!nblocks); + + if (PageUptodate(page)) { + err = 0; + goto has_updated; + } + + /* note that for readpage case, bio also equals to NULL */ + if (bio && + /* not continuous */ + *last_block + 1 != current_block) { +submit_bio_retry: + submit_bio(bio); + bio = NULL; + } + + if (!bio) { + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(current_block), + }; + erofs_blk_t blknr; + unsigned int blkoff; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (err) + goto err_out; + + /* zero out the holed page */ + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* for RAW access mode, m_plen must be equal to m_llen */ + DBG_BUGON(map.m_plen != map.m_llen); + + blknr = erofs_blknr(map.m_pa); + blkoff = erofs_blkoff(map.m_pa); + + /* deal with inline page */ + if (map.m_flags & EROFS_MAP_META) { + void *vsrc, *vto; + struct page *ipage; + + DBG_BUGON(map.m_plen > PAGE_SIZE); + + ipage = erofs_get_meta_page(inode->i_sb, blknr); + + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto err_out; + } + + vsrc = kmap_atomic(ipage); + vto = kmap_atomic(page); + memcpy(vto, vsrc + blkoff, map.m_plen); + memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); + kunmap_atomic(vto); + kunmap_atomic(vsrc); + flush_dcache_page(page); + + SetPageUptodate(page); + /* TODO: could we unlock the page earlier? */ + unlock_page(ipage); + put_page(ipage); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* pa must be block-aligned for raw reading */ + DBG_BUGON(erofs_blkoff(map.m_pa)); + + /* max # of continuous pages */ + if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) + nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (nblocks > BIO_MAX_PAGES) + nblocks = BIO_MAX_PAGES; + + bio = bio_alloc(GFP_NOIO, nblocks); + + bio->bi_end_io = erofs_readendio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)blknr << + LOG_SECTORS_PER_BLOCK; + bio->bi_opf = REQ_OP_READ; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + /* out of the extent or bio is full */ + if (err < PAGE_SIZE) + goto submit_bio_retry; + + *last_block = current_block; + + /* shift in advance in case of it followed by too many gaps */ + if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) { + /* err should reassign to 0 after submitting */ + err = 0; + goto submit_bio_out; + } + + return bio; + +err_out: + /* for sync reading, set page error immediately */ + if (!ra) { + SetPageError(page); + ClearPageUptodate(page); + } +has_updated: + unlock_page(page); + + /* if updated manually, continuous pages has a gap */ + if (bio) +submit_bio_out: + submit_bio(bio); + return err ? ERR_PTR(err) : NULL; +} + +/* + * since we dont have write or truncate flows, so no inode + * locking needs to be held at the moment. + */ +static int erofs_raw_access_readpage(struct file *file, struct page *page) +{ + erofs_off_t last_block; + struct bio *bio; + + trace_erofs_readpage(page, true); + + bio = erofs_read_raw_page(NULL, page->mapping, + page, &last_block, 1, false); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + DBG_BUGON(bio); /* since we have only one bio -- must be NULL */ + return 0; +} + +static int erofs_raw_access_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, + unsigned int nr_pages) +{ + erofs_off_t last_block; + struct bio *bio = NULL; + gfp_t gfp = readahead_gfp_mask(mapping); + struct page *page = list_last_entry(pages, struct page, lru); + + trace_erofs_readpages(mapping->host, page, nr_pages, true); + + for (; nr_pages; --nr_pages) { + page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { + bio = erofs_read_raw_page(bio, mapping, page, + &last_block, nr_pages, true); + + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_I(mapping->host)->nid); + + bio = NULL; + } + } + + /* pages could still be locked */ + put_page(page); + } + DBG_BUGON(!list_empty(pages)); + + /* the rare case (end in gaps) */ + if (bio) + submit_bio(bio); + return 0; +} + +static int erofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct erofs_map_blocks map = { + .m_la = iblock << 9, + }; + int err; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (err) + return err; + + if (map.m_flags & EROFS_MAP_MAPPED) + bh->b_blocknr = erofs_blknr(map.m_pa); + + return err; +} + +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + + if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) { + erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; + + if (block >> LOG_SECTORS_PER_BLOCK >= blks) + return 0; + } + + return generic_block_bmap(mapping, block, erofs_get_block); +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_raw_access_readpage, + .readpages = erofs_raw_access_readpages, + .bmap = erofs_bmap, +}; + diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c new file mode 100644 index 000000000000..19f89f9fb10c --- /dev/null +++ b/fs/erofs/decompressor.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "compress.h" +#include <linux/module.h> +#include <linux/lz4.h> + +#ifndef LZ4_DISTANCE_MAX /* history window size */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) +#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN +#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) +#endif + +struct z_erofs_decompressor { + /* + * if destpages have sparsed pages, fill them with bounce pages. + * it also check whether destpages indicate continuous physical memory. + */ + int (*prepare_destpages)(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); + char *name; +}; + +static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nr = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; + unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, + BITS_PER_LONG)] = { 0 }; + void *kaddr = NULL; + unsigned int i, j, top; + + top = 0; + for (i = j = 0; i < nr; ++i, ++j) { + struct page *const page = rq->out[i]; + struct page *victim; + + if (j >= LZ4_MAX_DISTANCE_PAGES) + j = 0; + + /* 'valid' bounced can only be tested after a complete round */ + if (test_bit(j, bounced)) { + DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); + DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); + availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; + } + + if (page) { + __clear_bit(j, bounced); + if (kaddr) { + if (kaddr + PAGE_SIZE == page_address(page)) + kaddr += PAGE_SIZE; + else + kaddr = NULL; + } else if (!i) { + kaddr = page_address(page); + } + continue; + } + kaddr = NULL; + __set_bit(j, bounced); + + if (top) { + victim = availables[--top]; + get_page(victim); + } else { + victim = erofs_allocpage(pagepool, GFP_KERNEL, false); + if (!victim) + return -ENOMEM; + victim->mapping = Z_EROFS_MAPPING_STAGING; + } + rq->out[i] = victim; + } + return kaddr ? 1 : 0; +} + +static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, + u8 *src, unsigned int pageofs_in) +{ + /* + * if in-place decompression is ongoing, those decompressed + * pages should be copied in order to avoid being overlapped. + */ + struct page **in = rq->in; + u8 *const tmp = erofs_get_pcpubuf(0); + u8 *tmpp = tmp; + unsigned int inlen = rq->inputsize - pageofs_in; + unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); + + while (tmpp < tmp + inlen) { + if (!src) + src = kmap_atomic(*in); + memcpy(tmpp, src + pageofs_in, count); + kunmap_atomic(src); + src = NULL; + tmpp += count; + pageofs_in = 0; + count = PAGE_SIZE; + ++in; + } + return tmp; +} + +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +{ + unsigned int inputmargin, inlen; + u8 *src; + bool copied, support_0padding; + int ret; + + if (rq->inputsize > PAGE_SIZE) + return -EOPNOTSUPP; + + src = kmap_atomic(*rq->in); + inputmargin = 0; + support_0padding = false; + + /* decompression inplace is only safe when 0padding is enabled */ + if (EROFS_SB(rq->sb)->feature_incompat & + EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + support_0padding = true; + + while (!src[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= rq->inputsize) { + kunmap_atomic(src); + return -EIO; + } + } + + copied = false; + inlen = rq->inputsize - inputmargin; + if (rq->inplace_io) { + const uint oend = (rq->pageofs_out + + rq->outputsize) & ~PAGE_MASK; + const uint nr = PAGE_ALIGN(rq->pageofs_out + + rq->outputsize) >> PAGE_SHIFT; + + if (rq->partial_decoding || !support_0padding || + rq->out[nr - 1] != rq->in[0] || + rq->inputsize - oend < + LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { + src = generic_copy_inplace_data(rq, src, inputmargin); + inputmargin = 0; + copied = true; + } + } + + ret = LZ4_decompress_safe_partial(src + inputmargin, out, + inlen, rq->outputsize, + rq->outputsize); + if (ret < 0) { + erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]", + inlen, inputmargin, rq->outputsize); + WARN_ON(1); + print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, + 16, 1, src + inputmargin, inlen, true); + print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, rq->outputsize, true); + ret = -EIO; + } + + if (copied) + erofs_put_pcpubuf(src); + else + kunmap_atomic(src); + return ret; +} + +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .prepare_destpages = z_erofs_lz4_prepare_destpages, + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +}; + +static void copy_from_pcpubuf(struct page **out, const char *dst, + unsigned short pageofs_out, + unsigned int outputsize) +{ + const char *end = dst + outputsize; + const unsigned int righthalf = PAGE_SIZE - pageofs_out; + const char *cur = dst - pageofs_out; + + while (cur < end) { + struct page *const page = *out++; + + if (page) { + char *buf = kmap_atomic(page); + + if (cur >= dst) { + memcpy(buf, cur, min_t(uint, PAGE_SIZE, + end - cur)); + } else { + memcpy(buf + pageofs_out, cur + pageofs_out, + min_t(uint, righthalf, end - cur)); + } + kunmap_atomic(buf); + } + cur += PAGE_SIZE; + } +} + +static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const struct z_erofs_decompressor *alg = decompressors + rq->alg; + unsigned int dst_maptype; + void *dst; + int ret, i; + + if (nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } + + /* + * For the case of small output size (especially much less + * than PAGE_SIZE), memcpy the decompressed data rather than + * compressed data is preferred. + */ + if (rq->outputsize <= PAGE_SIZE * 7 / 8) { + dst = erofs_get_pcpubuf(0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + rq->inplace_io = false; + ret = alg->decompress(rq, dst); + if (!ret) + copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, + rq->outputsize); + + erofs_put_pcpubuf(dst); + return ret; + } + + ret = alg->prepare_destpages(rq, pagepool); + if (ret < 0) { + return ret; + } else if (ret) { + dst = page_address(*rq->out); + dst_maptype = 1; + goto dstmap_out; + } + + i = 0; + while (1) { + dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + + /* retry two more times (totally 3 times) */ + if (dst || ++i >= 3) + break; + vm_unmap_aliases(); + } + + if (!dst) + return -ENOMEM; + + dst_maptype = 2; + +dstmap_out: + ret = alg->decompress(rq, dst + rq->pageofs_out); + + if (!dst_maptype) + kunmap_atomic(dst); + else if (dst_maptype == 2) + vm_unmap_ram(dst, nrpages_out); + return ret; +} + +static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out; + unsigned char *src, *dst; + + if (nrpages_out > 2) { + DBG_BUGON(1); + return -EIO; + } + + if (rq->out[0] == *rq->in) { + DBG_BUGON(nrpages_out != 1); + return 0; + } + + src = kmap_atomic(*rq->in); + if (!rq->out[0]) { + dst = NULL; + } else { + dst = kmap_atomic(rq->out[0]); + memcpy(dst + rq->pageofs_out, src, righthalf); + } + + if (rq->out[1] == *rq->in) { + memmove(src, src + righthalf, rq->pageofs_out); + } else if (nrpages_out == 2) { + if (dst) + kunmap_atomic(dst); + DBG_BUGON(!rq->out[1]); + dst = kmap_atomic(rq->out[1]); + memcpy(dst, src + righthalf, rq->pageofs_out); + } + if (dst) + kunmap_atomic(dst); + kunmap_atomic(src); + return 0; +} + +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) + return z_erofs_shifted_transform(rq, pagepool); + return z_erofs_decompress_generic(rq, pagepool); +} + diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c new file mode 100644 index 000000000000..d28c623dfef9 --- /dev/null +++ b/fs/erofs/dir.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" + +static void debug_one_dentry(unsigned char d_type, const char *de_name, + unsigned int de_namelen) +{ +#ifdef CONFIG_EROFS_FS_DEBUG + /* since the on-disk name could not have the trailing '\0' */ + unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; + + memcpy(dbg_namebuf, de_name, de_namelen); + dbg_namebuf[de_namelen] = '\0'; + + erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, + de_namelen, d_type); +#endif +} + +static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, + void *dentry_blk, unsigned int *ofs, + unsigned int nameoff, unsigned int maxsize) +{ + struct erofs_dirent *de = dentry_blk + *ofs; + const struct erofs_dirent *end = dentry_blk + nameoff; + + while (de < end) { + const char *de_name; + unsigned int de_namelen; + unsigned char d_type; + + d_type = fs_ftype_to_dtype(de->file_type); + + nameoff = le16_to_cpu(de->nameoff); + de_name = (char *)dentry_blk + nameoff; + + /* the last dirent in the block? */ + if (de + 1 >= end) + de_namelen = strnlen(de_name, maxsize - nameoff); + else + de_namelen = le16_to_cpu(de[1].nameoff) - nameoff; + + /* a corrupted entry is found */ + if (nameoff + de_namelen > maxsize || + de_namelen > EROFS_NAME_LEN) { + erofs_err(dir->i_sb, "bogus dirent @ nid %llu", + EROFS_I(dir)->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + debug_one_dentry(d_type, de_name, de_namelen); + if (!dir_emit(ctx, de_name, de_namelen, + le64_to_cpu(de->nid), d_type)) + /* stopped by some reason */ + return 1; + ++de; + *ofs += sizeof(struct erofs_dirent); + } + *ofs = maxsize; + return 0; +} + +static int erofs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct address_space *mapping = dir->i_mapping; + const size_t dirsize = i_size_read(dir); + unsigned int i = ctx->pos / EROFS_BLKSIZ; + unsigned int ofs = ctx->pos % EROFS_BLKSIZ; + int err = 0; + bool initial = true; + + while (ctx->pos < dirsize) { + struct page *dentry_page; + struct erofs_dirent *de; + unsigned int nameoff, maxsize; + + dentry_page = read_mapping_page(mapping, i, NULL); + if (dentry_page == ERR_PTR(-ENOMEM)) { + err = -ENOMEM; + break; + } else if (IS_ERR(dentry_page)) { + erofs_err(dir->i_sb, + "fail to readdir of logical block %u of nid %llu", + i, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; + break; + } + + de = (struct erofs_dirent *)kmap(dentry_page); + + nameoff = le16_to_cpu(de->nameoff); + + if (nameoff < sizeof(struct erofs_dirent) || + nameoff >= PAGE_SIZE) { + erofs_err(dir->i_sb, + "invalid de[0].nameoff %u @ nid %llu", + nameoff, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; + goto skip_this; + } + + maxsize = min_t(unsigned int, + dirsize - ctx->pos + ofs, PAGE_SIZE); + + /* search dirents at the arbitrary position */ + if (initial) { + initial = false; + + ofs = roundup(ofs, sizeof(struct erofs_dirent)); + if (ofs >= nameoff) + goto skip_this; + } + + err = erofs_fill_dentries(dir, ctx, de, &ofs, + nameoff, maxsize); +skip_this: + kunmap(dentry_page); + + put_page(dentry_page); + + ctx->pos = blknr_to_addr(i) + ofs; + + if (err) + break; + ++i; + ofs = 0; + } + return err < 0 ? err : 0; +} + +const struct file_operations erofs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = erofs_readdir, +}; + diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h new file mode 100644 index 000000000000..b1ee5654750d --- /dev/null +++ b/fs/erofs/erofs_fs.h @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ +/* + * EROFS (Enhanced ROM File System) on-disk format definition + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_H +#define __EROFS_FS_H + +#define EROFS_SUPER_OFFSET 1024 + +/* + * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should + * be incompatible with this kernel version. + */ +#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 +#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING + +/* 128-byte erofs on-disk super block */ +struct erofs_super_block { + __le32 magic; /* file system magic number */ + __le32 checksum; /* crc32c(super_block) */ + __le32 feature_compat; + __u8 blkszbits; /* support block_size == PAGE_SIZE only */ + __u8 reserved; + + __le16 root_nid; /* nid of root directory */ + __le64 inos; /* total valid ino # (== f_files - f_favail) */ + + __le64 build_time; /* inode v1 time derivation */ + __le32 build_time_nsec; /* inode v1 time derivation in nano scale */ + __le32 blocks; /* used for statfs */ + __le32 meta_blkaddr; /* start block address of metadata area */ + __le32 xattr_blkaddr; /* start block address of shared xattr area */ + __u8 uuid[16]; /* 128-bit uuid for volume */ + __u8 volume_name[16]; /* volume name */ + __le32 feature_incompat; + + __u8 reserved2[44]; +}; + +/* + * erofs inode datalayout (i_format in on-disk inode): + * 0 - inode plain without inline data A: + * inode, [xattrs], ... | ... | no-holed data + * 1 - inode VLE compression B (legacy): + * inode, [xattrs], extents ... | ... + * 2 - inode plain with inline data C: + * inode, [xattrs], last_inline_data, ... | ... | no-holed data + * 3 - inode compression D: + * inode, [xattrs], map_header, extents ... | ... + * 4~7 - reserved + */ +enum { + EROFS_INODE_FLAT_PLAIN = 0, + EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, + EROFS_INODE_FLAT_INLINE = 2, + EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_DATALAYOUT_MAX +}; + +static inline bool erofs_inode_is_data_compressed(unsigned int datamode) +{ + return datamode == EROFS_INODE_FLAT_COMPRESSION || + datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; +} + +/* bit definitions of inode i_advise */ +#define EROFS_I_VERSION_BITS 1 +#define EROFS_I_DATALAYOUT_BITS 3 + +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 + +/* 32-byte reduced form of an ondisk inode */ +struct erofs_inode_compact { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_nlink; + __le32 i_size; + __le32 i_reserved; + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u; + __le32 i_ino; /* only used for 32-bit stat compatibility */ + __le16 i_uid; + __le16 i_gid; + __le32 i_reserved2; +}; + +/* 32 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 + +/* 64-byte complete form of an ondisk inode */ +struct erofs_inode_extended { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_reserved; + __le64 i_size; + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u; + + /* only used for 32-bit stat compatibility */ + __le32 i_ino; + + __le32 i_uid; + __le32 i_gid; + __le64 i_ctime; + __le32 i_ctime_nsec; + __le32 i_nlink; + __u8 i_reserved2[16]; +}; + +#define EROFS_MAX_SHARED_XATTRS (128) +/* h_shared_count between 129 ... 255 are special # */ +#define EROFS_SHARED_XATTR_EXTENT (255) + +/* + * inline xattrs (n == i_xattr_icount): + * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes + * 12 bytes / \ + * / \ + * /-----------------------\ + * | erofs_xattr_entries+ | + * +-----------------------+ + * inline xattrs must starts in erofs_xattr_ibody_header, + * for read-only fs, no need to introduce h_refcount + */ +struct erofs_xattr_ibody_header { + __le32 h_reserved; + __u8 h_shared_count; + __u8 h_reserved2[7]; + __le32 h_shared_xattrs[0]; /* shared xattr id array */ +}; + +/* Name indexes */ +#define EROFS_XATTR_INDEX_USER 1 +#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EROFS_XATTR_INDEX_TRUSTED 4 +#define EROFS_XATTR_INDEX_LUSTRE 5 +#define EROFS_XATTR_INDEX_SECURITY 6 + +/* xattr entry (for both inline & shared xattrs) */ +struct erofs_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_size; /* size of attribute value */ + /* followed by e_name and e_value */ + char e_name[0]; /* attribute name */ +}; + +static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) +{ + if (!i_xattr_icount) + return 0; + + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); +} + +#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) + +static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) +{ + return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) + + e->e_name_len + le16_to_cpu(e->e_value_size)); +} + +/* available compression algorithm types (for h_algorithmtype) */ +enum { + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_MAX +}; + +/* + * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) + * e.g. for 4k logical cluster size, 4B if compacted 2B is off; + * (4B) + 2B + (4B) if compacted 2B is on. + */ +#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 + +#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) + +struct z_erofs_map_header { + __le32 h_reserved1; + __le16 h_advise; + /* + * bit 0-3 : algorithm type of head 1 (logical cluster type 01); + * bit 4-7 : algorithm type of head 2 (logical cluster type 11). + */ + __u8 h_algorithmtype; + /* + * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; + * bit 3-4 : (physical - logical) cluster bits of head 1: + * For example, if logical clustersize = 4096, 1 for 8192. + * bit 5-7 : (physical - logical) cluster bits of head 2. + */ + __u8 h_clusterbits; +}; + +#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 + +/* + * Fixed-sized output compression ondisk Logical Extent cluster type: + * 0 - literal (uncompressed) cluster + * 1 - compressed cluster (for the head logical cluster) + * 2 - compressed cluster (for the other logical clusters) + * + * In detail, + * 0 - literal (uncompressed) cluster, + * di_advise = 0 + * di_clusterofs = the literal data offset of the cluster + * di_blkaddr = the blkaddr of the literal cluster + * + * 1 - compressed cluster (for the head logical cluster) + * di_advise = 1 + * di_clusterofs = the decompressed data offset of the cluster + * di_blkaddr = the blkaddr of the compressed cluster + * + * 2 - compressed cluster (for the other logical clusters) + * di_advise = 2 + * di_clusterofs = + * the decompressed data offset in its own head cluster + * di_u.delta[0] = distance to its corresponding head cluster + * di_u.delta[1] = distance to its corresponding tail cluster + * (di_advise could be 0, 1 or 2) + */ +enum { + Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_MAX +}; + +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 + +struct z_erofs_vle_decompressed_index { + __le16 di_advise; + /* where to decompress in the head cluster */ + __le16 di_clusterofs; + + union { + /* for the head cluster */ + __le32 blkaddr; + /* + * for the rest clusters + * eg. for 4k page-sized cluster, maximum 4K*64k = 256M) + * [0] - pointing to the head cluster + * [1] - pointing to the tail cluster + */ + __le16 delta[2]; + } di_u; +}; + +#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ + (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ + sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + +/* check the EROFS on-disk layout strictly at compile time */ +static inline void erofs_check_ondisk_layout_definitions(void) +{ + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); + BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); + BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + + BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < + Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); +} + +#endif + diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c new file mode 100644 index 000000000000..3350ab65d892 --- /dev/null +++ b/fs/erofs/inode.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +/* no locking */ +static int erofs_read_inode(struct inode *inode, void *data) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_compact *dic = data; + struct erofs_inode_extended *die; + + const unsigned int ifmt = le16_to_cpu(dic->i_format); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + erofs_blk_t nblks = 0; + + vi->datalayout = erofs_inode_datalayout(ifmt); + + if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { + erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + vi->datalayout, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + + switch (erofs_inode_version(ifmt)) { + case EROFS_INODE_LAYOUT_EXTENDED: + die = data; + + vi->inode_isize = sizeof(struct erofs_inode_extended); + vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); + + inode->i_mode = le16_to_cpu(die->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = + new_decode_dev(le32_to_cpu(die->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + goto bogusimode; + } + i_uid_write(inode, le32_to_cpu(die->i_uid)); + i_gid_write(inode, le32_to_cpu(die->i_gid)); + set_nlink(inode, le32_to_cpu(die->i_nlink)); + + /* ns timestamp */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + le64_to_cpu(die->i_ctime); + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + le32_to_cpu(die->i_ctime_nsec); + + inode->i_size = le64_to_cpu(die->i_size); + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(die->i_u.compressed_blocks); + break; + case EROFS_INODE_LAYOUT_COMPACT: + vi->inode_isize = sizeof(struct erofs_inode_compact); + vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); + + inode->i_mode = le16_to_cpu(dic->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = + new_decode_dev(le32_to_cpu(dic->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + goto bogusimode; + } + i_uid_write(inode, le16_to_cpu(dic->i_uid)); + i_gid_write(inode, le16_to_cpu(dic->i_gid)); + set_nlink(inode, le16_to_cpu(dic->i_nlink)); + + /* use build time to derive all file time */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + sbi->build_time; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + sbi->build_time_nsec; + + inode->i_size = le32_to_cpu(dic->i_size); + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(dic->i_u.compressed_blocks); + break; + default: + erofs_err(inode->i_sb, + "unsupported on-disk inode version %u of nid %llu", + erofs_inode_version(ifmt), vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + + if (!nblks) + /* measure inode.i_blocks as generic filesystems */ + inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; + else + inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; + return 0; + +bogusimode: + erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", + inode->i_mode, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + +static int erofs_fill_symlink(struct inode *inode, void *data, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + char *lnk; + + /* if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + inode->i_size >= PAGE_SIZE) { + inode->i_op = &erofs_symlink_iops; + return 0; + } + + lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); + if (!lnk) + return -ENOMEM; + + m_pofs += vi->inode_isize + vi->xattr_isize; + /* inline symlink data shouldn't cross page boundary as well */ + if (m_pofs + inode->i_size > PAGE_SIZE) { + kfree(lnk); + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + memcpy(lnk, data + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + + inode->i_link = lnk; + inode->i_op = &erofs_fast_symlink_iops; + return 0; +} + +static int erofs_fill_inode(struct inode *inode, int isdir) +{ + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct page *page; + void *data; + int err; + erofs_blk_t blkaddr; + unsigned int ofs; + erofs_off_t inode_loc; + + trace_erofs_fill_inode(inode, isdir); + inode_loc = iloc(EROFS_SB(sb), vi->nid); + blkaddr = erofs_blknr(inode_loc); + ofs = erofs_blkoff(inode_loc); + + erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, ofs, blkaddr); + + page = erofs_get_meta_page(sb, blkaddr); + + if (IS_ERR(page)) { + erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); + return PTR_ERR(page); + } + + DBG_BUGON(!PageUptodate(page)); + data = page_address(page); + + err = erofs_read_inode(inode, data + ofs); + if (err) + goto out_unlock; + + /* setup the new inode */ + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &erofs_generic_iops; + inode->i_fop = &generic_ro_fops; + break; + case S_IFDIR: + inode->i_op = &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + break; + case S_IFLNK: + err = erofs_fill_symlink(inode, data, ofs); + if (err) + goto out_unlock; + inode_nohighmem(inode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + inode->i_op = &erofs_generic_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + goto out_unlock; + default: + err = -EFSCORRUPTED; + goto out_unlock; + } + + if (erofs_inode_is_data_compressed(vi->datalayout)) { + err = z_erofs_fill_inode(inode); + goto out_unlock; + } + inode->i_mapping->a_ops = &erofs_raw_access_aops; + +out_unlock: + unlock_page(page); + put_page(page); + return err; +} + +/* + * erofs nid is 64bits, but i_ino is 'unsigned long', therefore + * we should do more for 32-bit platform to find the right inode. + */ +static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + return EROFS_I(inode)->nid == nid; +} + +static int erofs_iget_set_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + inode->i_ino = erofs_inode_hash(nid); + return 0; +} + +static inline struct inode *erofs_iget_locked(struct super_block *sb, + erofs_nid_t nid) +{ + const unsigned long hashval = erofs_inode_hash(nid); + + return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + erofs_iget_set_actor, &nid); +} + +struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, + bool isdir) +{ + struct inode *inode = erofs_iget_locked(sb, nid); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int err; + struct erofs_inode *vi = EROFS_I(inode); + + vi->nid = nid; + + err = erofs_fill_inode(inode, isdir); + if (!err) + unlock_new_inode(inode); + else { + iget_failed(inode); + inode = ERR_PTR(err); + } + } + return inode; +} + +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *const inode = d_inode(path->dentry); + + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE); + + generic_fillattr(inode, stat); + return 0; +} + +const struct inode_operations erofs_generic_iops = { + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_symlink_iops = { + .get_link = page_get_link, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_fast_symlink_iops = { + .get_link = simple_get_link, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h new file mode 100644 index 000000000000..544a453f3076 --- /dev/null +++ b/fs/erofs/internal.h @@ -0,0 +1,431 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_INTERNAL_H +#define __EROFS_INTERNAL_H + +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "erofs_fs.h" + +/* redefine pr_fmt "erofs: " */ +#undef pr_fmt +#define pr_fmt(fmt) "erofs: " fmt + +__printf(3, 4) void _erofs_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_err(sb, fmt, ...) \ + _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) +__printf(3, 4) void _erofs_info(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_info(sb, fmt, ...) \ + _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) +#ifdef CONFIG_EROFS_FS_DEBUG +#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__) +#define DBG_BUGON BUG_ON +#else +#define erofs_dbg(x, ...) ((void)0) +#define DBG_BUGON(x) ((void)(x)) +#endif /* !CONFIG_EROFS_FS_DEBUG */ + +/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ +#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 + +typedef u64 erofs_nid_t; +typedef u64 erofs_off_t; +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; + +struct erofs_sb_info { +#ifdef CONFIG_EROFS_FS_ZIP + /* list for all registered superblocks, mainly for shrinker */ + struct list_head list; + struct mutex umount_mutex; + + /* the dedicated workstation for compression */ + struct radix_tree_root workstn_tree; + + /* threshold for decompression synchronously */ + unsigned int max_sync_decompress_pages; + + unsigned int shrinker_run_no; + + /* current strategy of how to use managed cache */ + unsigned char cache_strategy; + + /* pseudo inode to manage cached pages */ + struct inode *managed_cache; +#endif /* CONFIG_EROFS_FS_ZIP */ + u32 blocks; + u32 meta_blkaddr; +#ifdef CONFIG_EROFS_FS_XATTR + u32 xattr_blkaddr; +#endif + + /* inode slot unit size in bit shift */ + unsigned char islotbits; + + u32 build_time_nsec; + u64 build_time; + + /* what we really care is nid, rather than ino.. */ + erofs_nid_t root_nid; + /* used for statfs, f_files - f_favail */ + u64 inos; + + u8 uuid[16]; /* 128-bit uuid for volume */ + u8 volume_name[16]; /* volume name */ + u32 feature_incompat; + + unsigned int mount_opt; +}; + +#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) +#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) + +/* Mount flags set via mount options or defaults */ +#define EROFS_MOUNT_XATTR_USER 0x00000010 +#define EROFS_MOUNT_POSIX_ACL 0x00000020 + +#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) + +#ifdef CONFIG_EROFS_FS_ZIP +enum { + EROFS_ZIP_CACHE_DISABLED, + EROFS_ZIP_CACHE_READAHEAD, + EROFS_ZIP_CACHE_READAROUND +}; + +#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) + +/* basic unit of the workstation of a super_block */ +struct erofs_workgroup { + /* the workgroup index in the workstation */ + pgoff_t index; + + /* overall workgroup reference count */ + atomic_t refcount; +}; + +#if defined(CONFIG_SMP) +static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) +{ + preempt_disable(); + if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { + preempt_enable(); + return false; + } + return true; +} + +static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, + int orig_val) +{ + /* + * other observers should notice all modifications + * in the freezing period. + */ + smp_mb(); + atomic_set(&grp->refcount, orig_val); + preempt_enable(); +} + +static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) +{ + return atomic_cond_read_relaxed(&grp->refcount, + VAL != EROFS_LOCKED_MAGIC); +} +#else +static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) +{ + preempt_disable(); + /* no need to spin on UP platforms, let's just disable preemption. */ + if (val != atomic_read(&grp->refcount)) { + preempt_enable(); + return false; + } + return true; +} + +static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, + int orig_val) +{ + preempt_enable(); +} + +static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) +{ + int v = atomic_read(&grp->refcount); + + /* workgroup is never freezed on uniprocessor systems */ + DBG_BUGON(v == EROFS_LOCKED_MAGIC); + return v; +} +#endif /* !CONFIG_SMP */ + +/* hard limit of pages per compressed cluster */ +#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) +#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES +#else +#define EROFS_PCPUBUF_NR_PAGES 0 +#endif /* !CONFIG_EROFS_FS_ZIP */ + +/* we strictly follow PAGE_SIZE and no buffer head yet */ +#define LOG_BLOCK_SIZE PAGE_SHIFT + +#undef LOG_SECTORS_PER_BLOCK +#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) + +#undef SECTORS_PER_BLOCK +#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) + +#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) + +#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) +#error erofs cannot be used in this platform +#endif + +#define ROOT_NID(sb) ((sb)->root_nid) + +#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) +#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) +#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) + +static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) +{ + return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); +} + +/* atomic flag definitions */ +#define EROFS_I_EA_INITED_BIT 0 +#define EROFS_I_Z_INITED_BIT 1 + +/* bitlock definitions (arranged in reverse order) */ +#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1) +#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2) + +struct erofs_inode { + erofs_nid_t nid; + + /* atomic flags (including bitlocks) */ + unsigned long flags; + + unsigned char datalayout; + unsigned char inode_isize; + unsigned short xattr_isize; + + unsigned int xattr_shared_count; + unsigned int *xattr_shared_xattrs; + + union { + erofs_blk_t raw_blkaddr; +#ifdef CONFIG_EROFS_FS_ZIP + struct { + unsigned short z_advise; + unsigned char z_algorithmtype[2]; + unsigned char z_logical_clusterbits; + unsigned char z_physical_clusterbits[2]; + }; +#endif /* CONFIG_EROFS_FS_ZIP */ + }; + /* the corresponding vfs inode */ + struct inode vfs_inode; +}; + +#define EROFS_I(ptr) \ + container_of(ptr, struct erofs_inode, vfs_inode) + +static inline unsigned long erofs_inode_datablocks(struct inode *inode) +{ + /* since i_size cannot be changed */ + return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); +} + +static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, + unsigned int bits) +{ + + return (value >> bit) & ((1 << bits) - 1); +} + + +static inline unsigned int erofs_inode_version(unsigned int value) +{ + return erofs_bitrange(value, EROFS_I_VERSION_BIT, + EROFS_I_VERSION_BITS); +} + +static inline unsigned int erofs_inode_datalayout(unsigned int value) +{ + return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT, + EROFS_I_DATALAYOUT_BITS); +} + +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ZIP +extern const struct address_space_operations z_erofs_vle_normalaccess_aops; +#endif + +/* + * Logical to physical block mapping, used by erofs_map_blocks() + * + * Different with other file systems, it is used for 2 access modes: + * + * 1) RAW access mode: + * + * Users pass a valid (m_lblk, m_lofs -- usually 0) pair, + * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). + * + * Note that m_lblk in the RAW access mode refers to the number of + * the compressed ondisk block rather than the uncompressed + * in-memory block for the compressed file. + * + * m_pofs equals to m_lofs except for the inline data page. + * + * 2) Normal access mode: + * + * If the inode is not compressed, it has no difference with + * the RAW access mode. However, if the inode is compressed, + * users should pass a valid (m_lblk, m_lofs) pair, and get + * the needed m_pblk, m_pofs, m_len to get the compressed data + * and the updated m_lblk, m_lofs which indicates the start + * of the corresponding uncompressed data in the file. + */ +enum { + BH_Zipped = BH_PrivateStart, + BH_FullMapped, +}; + +/* Has a disk mapping */ +#define EROFS_MAP_MAPPED (1 << BH_Mapped) +/* Located in metadata (could be copied from bd_inode) */ +#define EROFS_MAP_META (1 << BH_Meta) +/* The extent has been compressed */ +#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The length of extent is full */ +#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) + +struct erofs_map_blocks { + erofs_off_t m_pa, m_la; + u64 m_plen, m_llen; + + unsigned int m_flags; + + struct page *mpage; +}; + +/* Flags used by erofs_map_blocks() */ +#define EROFS_GET_BLOCKS_RAW 0x0001 + +/* zmap.c */ +#ifdef CONFIG_EROFS_FS_ZIP +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags); +#else +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } +static inline int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + +/* data.c */ +struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); + +int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); + +/* inode.c */ +static inline unsigned long erofs_inode_hash(erofs_nid_t nid) +{ +#if BITS_PER_LONG == 32 + return (nid >> 32) ^ (nid & 0xffffffff); +#else + return nid; +#endif +} + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; + +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); + +/* namei.c */ +extern const struct inode_operations erofs_dir_iops; + +int erofs_namei(struct inode *dir, struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type); + +/* dir.c */ +extern const struct file_operations erofs_dir_fops; + +/* utils.c / zdata.c */ +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail); + +#if (EROFS_PCPUBUF_NR_PAGES > 0) +void *erofs_get_pcpubuf(unsigned int pagenr); +#define erofs_put_pcpubuf(buf) do { \ + (void)&(buf); \ + preempt_enable(); \ +} while (0) +#else +static inline void *erofs_get_pcpubuf(unsigned int pagenr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +#define erofs_put_pcpubuf(buf) do {} while (0) +#endif + +#ifdef CONFIG_EROFS_FS_ZIP +int erofs_workgroup_put(struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index, bool *tag); +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, bool tag); +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +void erofs_shrinker_register(struct super_block *sb); +void erofs_shrinker_unregister(struct super_block *sb); +int __init erofs_init_shrinker(void); +void erofs_exit_shrinker(void); +int __init z_erofs_init_zip_subsystem(void); +void z_erofs_exit_zip_subsystem(void); +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); +#else +static inline void erofs_shrinker_register(struct super_block *sb) {} +static inline void erofs_shrinker_unregister(struct super_block *sb) {} +static inline int erofs_init_shrinker(void) { return 0; } +static inline void erofs_exit_shrinker(void) {} +static inline int z_erofs_init_zip_subsystem(void) { return 0; } +static inline void z_erofs_exit_zip_subsystem(void) {} +#endif /* !CONFIG_EROFS_FS_ZIP */ + +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* __EROFS_INTERNAL_H */ + diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c new file mode 100644 index 000000000000..3abbecbf73de --- /dev/null +++ b/fs/erofs/namei.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +struct erofs_qstr { + const unsigned char *name; + const unsigned char *end; +}; + +/* based on the end of qn is accurate and it must have the trailing '\0' */ +static inline int erofs_dirnamecmp(const struct erofs_qstr *qn, + const struct erofs_qstr *qd, + unsigned int *matched) +{ + unsigned int i = *matched; + + /* + * on-disk error, let's only BUG_ON in the debugging mode. + * otherwise, it will return 1 to just skip the invalid name + * and go on (in consideration of the lookup performance). + */ + DBG_BUGON(qd->name > qd->end); + + /* qd could not have trailing '\0' */ + /* However it is absolutely safe if < qd->end */ + while (qd->name + i < qd->end && qd->name[i] != '\0') { + if (qn->name[i] != qd->name[i]) { + *matched = i; + return qn->name[i] > qd->name[i] ? 1 : -1; + } + ++i; + } + *matched = i; + /* See comments in __d_alloc on the terminating NUL character */ + return qn->name[i] == '\0' ? 0 : 1; +} + +#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1)) + +static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, + u8 *data, + unsigned int dirblksize, + const int ndirents) +{ + int head, back; + unsigned int startprfx, endprfx; + struct erofs_dirent *const de = (struct erofs_dirent *)data; + + /* since the 1st dirent has been evaluated previously */ + head = 1; + back = ndirents - 1; + startprfx = endprfx = 0; + + while (head <= back) { + const int mid = head + (back - head) / 2; + const int nameoff = nameoff_from_disk(de[mid].nameoff, + dirblksize); + unsigned int matched = min(startprfx, endprfx); + struct erofs_qstr dname = { + .name = data + nameoff, + .end = mid >= ndirents - 1 ? + data + dirblksize : + data + nameoff_from_disk(de[mid + 1].nameoff, + dirblksize) + }; + + /* string comparison without already matched prefix */ + int ret = erofs_dirnamecmp(name, &dname, &matched); + + if (!ret) { + return de + mid; + } else if (ret > 0) { + head = mid + 1; + startprfx = matched; + } else { + back = mid - 1; + endprfx = matched; + } + } + + return ERR_PTR(-ENOENT); +} + +static struct page *find_target_block_classic(struct inode *dir, + struct erofs_qstr *name, + int *_ndirents) +{ + unsigned int startprfx, endprfx; + int head, back; + struct address_space *const mapping = dir->i_mapping; + struct page *candidate = ERR_PTR(-ENOENT); + + startprfx = endprfx = 0; + head = 0; + back = erofs_inode_datablocks(dir) - 1; + + while (head <= back) { + const int mid = head + (back - head) / 2; + struct page *page = read_mapping_page(mapping, mid, NULL); + + if (!IS_ERR(page)) { + struct erofs_dirent *de = kmap_atomic(page); + const int nameoff = nameoff_from_disk(de->nameoff, + EROFS_BLKSIZ); + const int ndirents = nameoff / sizeof(*de); + int diff; + unsigned int matched; + struct erofs_qstr dname; + + if (!ndirents) { + kunmap_atomic(de); + put_page(page); + erofs_err(dir->i_sb, + "corrupted dir block %d @ nid %llu", + mid, EROFS_I(dir)->nid); + DBG_BUGON(1); + page = ERR_PTR(-EFSCORRUPTED); + goto out; + } + + matched = min(startprfx, endprfx); + + dname.name = (u8 *)de + nameoff; + if (ndirents == 1) + dname.end = (u8 *)de + EROFS_BLKSIZ; + else + dname.end = (u8 *)de + + nameoff_from_disk(de[1].nameoff, + EROFS_BLKSIZ); + + /* string comparison without already matched prefix */ + diff = erofs_dirnamecmp(name, &dname, &matched); + kunmap_atomic(de); + + if (!diff) { + *_ndirents = 0; + goto out; + } else if (diff > 0) { + head = mid + 1; + startprfx = matched; + + if (!IS_ERR(candidate)) + put_page(candidate); + candidate = page; + *_ndirents = ndirents; + } else { + put_page(page); + + back = mid - 1; + endprfx = matched; + } + continue; + } +out: /* free if the candidate is valid */ + if (!IS_ERR(candidate)) + put_page(candidate); + return page; + } + return candidate; +} + +int erofs_namei(struct inode *dir, + struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type) +{ + int ndirents; + struct page *page; + void *data; + struct erofs_dirent *de; + struct erofs_qstr qn; + + if (!dir->i_size) + return -ENOENT; + + qn.name = name->name; + qn.end = name->name + name->len; + + ndirents = 0; + page = find_target_block_classic(dir, &qn, &ndirents); + + if (IS_ERR(page)) + return PTR_ERR(page); + + data = kmap_atomic(page); + /* the target page has been mapped */ + if (ndirents) + de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents); + else + de = (struct erofs_dirent *)data; + + if (!IS_ERR(de)) { + *nid = le64_to_cpu(de->nid); + *d_type = de->file_type; + } + + kunmap_atomic(data); + put_page(page); + + return PTR_ERR_OR_ZERO(de); +} + +/* NOTE: i_mutex is already held by vfs */ +static struct dentry *erofs_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + int err; + erofs_nid_t nid; + unsigned int d_type; + struct inode *inode; + + DBG_BUGON(!d_really_is_negative(dentry)); + /* dentry must be unhashed in lookup, no need to worry about */ + DBG_BUGON(!d_unhashed(dentry)); + + trace_erofs_lookup(dir, dentry, flags); + + /* file name exceeds fs limit */ + if (dentry->d_name.len > EROFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + /* false uninitialized warnings on gcc 4.8.x */ + err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); + + if (err == -ENOENT) { + /* negative dentry */ + inode = NULL; + } else if (err) { + inode = ERR_PTR(err); + } else { + erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__, + dentry->d_name.name, nid, d_type); + inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + } + return d_splice_alias(inode, dentry); +} + +const struct inode_operations erofs_dir_iops = { + .lookup = erofs_lookup, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + diff --git a/fs/erofs/super.c b/fs/erofs/super.c new file mode 100644 index 000000000000..caf9a95173b0 --- /dev/null +++ b/fs/erofs/super.c @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/statfs.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "xattr.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/erofs.h> + +static struct kmem_cache *erofs_inode_cachep __read_mostly; + +void _erofs_err(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); +} + +void _erofs_info(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_info("(device %s): %pV", sb->s_id, &vaf); + va_end(args); +} + +static void erofs_inode_init_once(void *ptr) +{ + struct erofs_inode *vi = ptr; + + inode_init_once(&vi->vfs_inode); +} + +static struct inode *erofs_alloc_inode(struct super_block *sb) +{ + struct erofs_inode *vi = + kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + + if (!vi) + return NULL; + + /* zero out everything except vfs_inode */ + memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); + return &vi->vfs_inode; +} + +static void erofs_free_inode(struct inode *inode) +{ + struct erofs_inode *vi = EROFS_I(inode); + + /* be careful of RCU symlink path */ + if (inode->i_op == &erofs_fast_symlink_iops) + kfree(inode->i_link); + kfree(vi->xattr_shared_xattrs); + + kmem_cache_free(erofs_inode_cachep, vi); +} + +static bool check_layout_compatibility(struct super_block *sb, + struct erofs_super_block *dsb) +{ + const unsigned int feature = le32_to_cpu(dsb->feature_incompat); + + EROFS_SB(sb)->feature_incompat = feature; + + /* check if current kernel meets all mandatory requirements */ + if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { + erofs_err(sb, + "unidentified incompatible feature %x, please upgrade kernel version", + feature & ~EROFS_ALL_FEATURE_INCOMPAT); + return false; + } + return true; +} + +static int erofs_read_superblock(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + struct page *page; + struct erofs_super_block *dsb; + unsigned int blkszbits; + void *data; + int ret; + + page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); + if (!page) { + erofs_err(sb, "cannot read erofs superblock"); + return -EIO; + } + + sbi = EROFS_SB(sb); + + data = kmap_atomic(page); + dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); + + ret = -EINVAL; + if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { + erofs_err(sb, "cannot find valid erofs superblock"); + goto out; + } + + blkszbits = dsb->blkszbits; + /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ + if (blkszbits != LOG_BLOCK_SIZE) { + erofs_err(sb, "blksize %u isn't supported on this platform", + 1 << blkszbits); + goto out; + } + + if (!check_layout_compatibility(sb, dsb)) + goto out; + + sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); +#ifdef CONFIG_EROFS_FS_XATTR + sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); +#endif + sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); + sbi->root_nid = le16_to_cpu(dsb->root_nid); + sbi->inos = le64_to_cpu(dsb->inos); + + sbi->build_time = le64_to_cpu(dsb->build_time); + sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); + + memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + + ret = strscpy(sbi->volume_name, dsb->volume_name, + sizeof(dsb->volume_name)); + if (ret < 0) { /* -E2BIG */ + erofs_err(sb, "bad volume name without NIL terminator"); + ret = -EFSCORRUPTED; + goto out; + } + ret = 0; +out: + kunmap_atomic(data); + put_page(page); + return ret; +} + +#ifdef CONFIG_EROFS_FS_ZIP +static int erofs_build_cache_strategy(struct super_block *sb, + substring_t *args) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + const char *cs = match_strdup(args); + int err = 0; + + if (!cs) { + erofs_err(sb, "Not enough memory to store cache strategy"); + return -ENOMEM; + } + + if (!strcmp(cs, "disabled")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED; + } else if (!strcmp(cs, "readahead")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD; + } else if (!strcmp(cs, "readaround")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + } else { + erofs_err(sb, "Unrecognized cache strategy \"%s\"", cs); + err = -EINVAL; + } + kfree(cs); + return err; +} +#else +static int erofs_build_cache_strategy(struct super_block *sb, + substring_t *args) +{ + erofs_info(sb, "EROFS compression is disabled, so cache strategy is ignored"); + return 0; +} +#endif + +/* set up default EROFS parameters */ +static void erofs_default_options(struct erofs_sb_info *sbi) +{ +#ifdef CONFIG_EROFS_FS_ZIP + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->max_sync_decompress_pages = 3; +#endif +#ifdef CONFIG_EROFS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif +} + +enum { + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_cache_strategy, + Opt_err +}; + +static match_table_t erofs_tokens = { + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_cache_strategy, "cache_strategy=%s"}, + {Opt_err, NULL} +}; + +static int erofs_parse_options(struct super_block *sb, char *options) +{ + substring_t args[MAX_OPT_ARGS]; + char *p; + int err; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, erofs_tokens, args); + + switch (token) { +#ifdef CONFIG_EROFS_FS_XATTR + case Opt_user_xattr: + set_opt(EROFS_SB(sb), XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(EROFS_SB(sb), XATTR_USER); + break; +#else + case Opt_user_xattr: + erofs_info(sb, "user_xattr options not supported"); + break; + case Opt_nouser_xattr: + erofs_info(sb, "nouser_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + case Opt_acl: + set_opt(EROFS_SB(sb), POSIX_ACL); + break; + case Opt_noacl: + clear_opt(EROFS_SB(sb), POSIX_ACL); + break; +#else + case Opt_acl: + erofs_info(sb, "acl options not supported"); + break; + case Opt_noacl: + erofs_info(sb, "noacl options not supported"); + break; +#endif + case Opt_cache_strategy: + err = erofs_build_cache_strategy(sb, args); + if (err) + return err; + break; + default: + erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p); + return -EINVAL; + } + } + return 0; +} + +#ifdef CONFIG_EROFS_FS_ZIP +static const struct address_space_operations managed_cache_aops; + +static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +{ + int ret = 1; /* 0 - busy */ + struct address_space *const mapping = page->mapping; + + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(mapping->a_ops != &managed_cache_aops); + + if (PagePrivate(page)) + ret = erofs_try_to_free_cached_page(mapping, page); + + return ret; +} + +static void erofs_managed_cache_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + const unsigned int stop = length + offset; + + DBG_BUGON(!PageLocked(page)); + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > PAGE_SIZE || stop < length); + + if (offset == 0 && stop == PAGE_SIZE) + while (!erofs_managed_cache_releasepage(page, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations managed_cache_aops = { + .releasepage = erofs_managed_cache_releasepage, + .invalidatepage = erofs_managed_cache_invalidatepage, +}; + +static int erofs_init_managed_cache(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + + inode->i_mapping->a_ops = &managed_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + sbi->managed_cache = inode; + return 0; +} +#else +static int erofs_init_managed_cache(struct super_block *sb) { return 0; } +#endif + +static int erofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct erofs_sb_info *sbi; + int err; + + sb->s_magic = EROFS_SUPER_MAGIC; + + if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { + erofs_err(sb, "failed to set erofs blksize"); + return -EINVAL; + } + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + err = erofs_read_superblock(sb); + if (err) + return err; + + sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + + sb->s_op = &erofs_sops; + +#ifdef CONFIG_EROFS_FS_XATTR + sb->s_xattr = erofs_xattr_handlers; +#endif + /* set erofs default mount options */ + erofs_default_options(sbi); + + err = erofs_parse_options(sb, data); + if (err) + return err; + + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + +#ifdef CONFIG_EROFS_FS_ZIP + INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); +#endif + + /* get the root inode */ + inode = erofs_iget(sb, ROOT_NID(sbi), true); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (!S_ISDIR(inode->i_mode)) { + erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + iput(inode); + return -EINVAL; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + erofs_shrinker_register(sb); + /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ + err = erofs_init_managed_cache(sb); + if (err) + return err; + + erofs_info(sb, "mounted with opts: %s, root inode @ nid %llu.", + (char *)data, ROOT_NID(sbi)); + return 0; +} + +static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super); +} + +/* + * could be triggered after deactivate_locked_super() + * is called, thus including umount and failed to initialize. + */ +static void erofs_kill_sb(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + + WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + + kill_block_super(sb); + + sbi = EROFS_SB(sb); + if (!sbi) + return; + kfree(sbi); + sb->s_fs_info = NULL; +} + +/* called when ->s_root is non-NULL */ +static void erofs_put_super(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + DBG_BUGON(!sbi); + + erofs_shrinker_unregister(sb); +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + +static struct file_system_type erofs_fs_type = { + .owner = THIS_MODULE, + .name = "erofs", + .mount = erofs_mount, + .kill_sb = erofs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("erofs"); + +static int __init erofs_module_init(void) +{ + int err; + + erofs_check_ondisk_layout_definitions(); + + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT, + erofs_inode_init_once); + if (!erofs_inode_cachep) { + err = -ENOMEM; + goto icache_err; + } + + err = erofs_init_shrinker(); + if (err) + goto shrinker_err; + + err = z_erofs_init_zip_subsystem(); + if (err) + goto zip_err; + + err = register_filesystem(&erofs_fs_type); + if (err) + goto fs_err; + + return 0; + +fs_err: + z_erofs_exit_zip_subsystem(); +zip_err: + erofs_exit_shrinker(); +shrinker_err: + kmem_cache_destroy(erofs_inode_cachep); +icache_err: + return err; +} + +static void __exit erofs_module_exit(void) +{ + unregister_filesystem(&erofs_fs_type); + z_erofs_exit_zip_subsystem(); + erofs_exit_shrinker(); + + /* Ensure all RCU free inodes are safe before cache is destroyed. */ + rcu_barrier(); + kmem_cache_destroy(erofs_inode_cachep); +} + +/* get filesystem statistics */ +static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = EROFS_BLKSIZ; + buf->f_blocks = sbi->blocks; + buf->f_bfree = buf->f_bavail = 0; + + buf->f_files = ULLONG_MAX; + buf->f_ffree = ULLONG_MAX - sbi->inos; + + buf->f_namelen = EROFS_NAME_LEN; + + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +static int erofs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); + +#ifdef CONFIG_EROFS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif +#ifdef CONFIG_EROFS_FS_ZIP + if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) { + seq_puts(seq, ",cache_strategy=disabled"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) { + seq_puts(seq, ",cache_strategy=readahead"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { + seq_puts(seq, ",cache_strategy=readaround"); + } else { + seq_puts(seq, ",cache_strategy=(unknown)"); + DBG_BUGON(1); + } +#endif + return 0; +} + +static int erofs_remount(struct super_block *sb, int *flags, char *data) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int org_mnt_opt = sbi->mount_opt; + int err; + + DBG_BUGON(!sb_rdonly(sb)); + err = erofs_parse_options(sb, data); + if (err) + goto out; + + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + + *flags |= SB_RDONLY; + return 0; +out: + sbi->mount_opt = org_mnt_opt; + return err; +} + +const struct super_operations erofs_sops = { + .put_super = erofs_put_super, + .alloc_inode = erofs_alloc_inode, + .free_inode = erofs_free_inode, + .statfs = erofs_statfs, + .show_options = erofs_show_options, + .remount_fs = erofs_remount, +}; + +module_init(erofs_module_init); +module_exit(erofs_module_exit); + +MODULE_DESCRIPTION("Enhanced ROM File System"); +MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_LICENSE("GPL"); + diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h new file mode 100644 index 000000000000..a72897c86744 --- /dev/null +++ b/fs/erofs/tagptr.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * A tagged pointer implementation + * + * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_TAGPTR_H +#define __EROFS_FS_TAGPTR_H + +#include <linux/types.h> +#include <linux/build_bug.h> + +/* + * the name of tagged pointer types are tagptr{1, 2, 3...}_t + * avoid directly using the internal structs __tagptr{1, 2, 3...} + */ +#define __MAKE_TAGPTR(n) \ +typedef struct __tagptr##n { \ + uintptr_t v; \ +} tagptr##n##_t; + +__MAKE_TAGPTR(1) +__MAKE_TAGPTR(2) +__MAKE_TAGPTR(3) +__MAKE_TAGPTR(4) + +#undef __MAKE_TAGPTR + +extern void __compiletime_error("bad tagptr tags") + __bad_tagptr_tags(void); + +extern void __compiletime_error("bad tagptr type") + __bad_tagptr_type(void); + +/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ +#define __tagptr_mask_1(ptr, n) \ + __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ + (1UL << (n)) - 1 : + +#define __tagptr_mask(ptr) (\ + __tagptr_mask_1(ptr, 1) ( \ + __tagptr_mask_1(ptr, 2) ( \ + __tagptr_mask_1(ptr, 3) ( \ + __tagptr_mask_1(ptr, 4) ( \ + __bad_tagptr_type(), 0))))) + +/* generate a tagged pointer from a raw value */ +#define tagptr_init(type, val) \ + ((typeof(type)){ .v = (uintptr_t)(val) }) + +/* + * directly cast a tagged pointer to the native pointer type, which + * could be used for backward compatibility of existing code. + */ +#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) + +/* encode tagged pointers */ +#define tagptr_fold(type, ptr, _tags) ({ \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ + __bad_tagptr_tags(); \ +tagptr_init(type, (uintptr_t)(ptr) | tags); }) + +/* decode tagged pointers */ +#define tagptr_unfold_ptr(tptr) \ + ((void *)((tptr).v & ~__tagptr_mask(tptr))) + +#define tagptr_unfold_tags(tptr) \ + ((tptr).v & __tagptr_mask(tptr)) + +/* operations for the tagger pointer */ +#define tagptr_eq(_tptr1, _tptr2) ({ \ + typeof(_tptr1) tptr1 = (_tptr1); \ + typeof(_tptr2) tptr2 = (_tptr2); \ + (void)(&tptr1 == &tptr2); \ +(tptr1).v == (tptr2).v; }) + +/* lock-free CAS operation */ +#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + typeof(_o) o = (_o); \ + typeof(_n) n = (_n); \ + (void)(&o == &n); \ + (void)(&o == ptptr); \ +tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) + +/* wrap WRITE_ONCE if atomic update is needed */ +#define tagptr_replace_tags(_ptptr, tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ +*ptptr; }) + +#define tagptr_set_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v |= tags; \ +*ptptr; }) + +#define tagptr_clear_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v &= ~tags; \ +*ptptr; }) + +#endif /* __EROFS_FS_TAGPTR_H */ + diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c new file mode 100644 index 000000000000..d92b3e753a6f --- /dev/null +++ b/fs/erofs/utils.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <linux/pagevec.h> + +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) +{ + struct page *page; + + if (!list_empty(pool)) { + page = lru_to_page(pool); + DBG_BUGON(page_ref_count(page) != 1); + list_del(&page->lru); + } else { + page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0); + } + return page; +} + +#if (EROFS_PCPUBUF_NR_PAGES > 0) +static struct { + u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; +} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; + +void *erofs_get_pcpubuf(unsigned int pagenr) +{ + preempt_disable(); + return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; +} +#endif + +#ifdef CONFIG_EROFS_FS_ZIP +/* global shrink count (for all mounted EROFS instances) */ +static atomic_long_t erofs_global_shrink_cnt; + +#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) +#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) + +static int erofs_workgroup_get(struct erofs_workgroup *grp) +{ + int o; + +repeat: + o = erofs_wait_on_workgroup_freezed(grp); + if (o <= 0) + return -1; + + if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o) + goto repeat; + + /* decrease refcount paired by erofs_workgroup_put */ + if (o == 1) + atomic_long_dec(&erofs_global_shrink_cnt); + return 0; +} + +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index, bool *tag) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_workgroup *grp; + +repeat: + rcu_read_lock(); + grp = radix_tree_lookup(&sbi->workstn_tree, index); + if (grp) { + *tag = xa_pointer_tag(grp); + grp = xa_untag_pointer(grp); + + if (erofs_workgroup_get(grp)) { + /* prefer to relax rcu read side */ + rcu_read_unlock(); + goto repeat; + } + + DBG_BUGON(index != grp->index); + } + rcu_read_unlock(); + return grp; +} + +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, + bool tag) +{ + struct erofs_sb_info *sbi; + int err; + + /* grp shouldn't be broken or used before */ + if (atomic_read(&grp->refcount) != 1) { + DBG_BUGON(1); + return -EINVAL; + } + + err = radix_tree_preload(GFP_NOFS); + if (err) + return err; + + sbi = EROFS_SB(sb); + xa_lock(&sbi->workstn_tree); + + grp = xa_tag_pointer(grp, tag); + + /* + * Bump up reference count before making this workgroup + * visible to other users in order to avoid potential UAF + * without serialized by workstn_lock. + */ + __erofs_workgroup_get(grp); + + err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); + if (err) + /* + * it's safe to decrease since the workgroup isn't visible + * and refcount >= 2 (cannot be freezed). + */ + __erofs_workgroup_put(grp); + + xa_unlock(&sbi->workstn_tree); + radix_tree_preload_end(); + return err; +} + +static void __erofs_workgroup_free(struct erofs_workgroup *grp) +{ + atomic_long_dec(&erofs_global_shrink_cnt); + erofs_workgroup_free_rcu(grp); +} + +int erofs_workgroup_put(struct erofs_workgroup *grp) +{ + int count = atomic_dec_return(&grp->refcount); + + if (count == 1) + atomic_long_inc(&erofs_global_shrink_cnt); + else if (!count) + __erofs_workgroup_free(grp); + return count; +} + +static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) +{ + erofs_workgroup_unfreeze(grp, 0); + __erofs_workgroup_free(grp); +} + +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp, + bool cleanup) +{ + /* + * If managed cache is on, refcount of workgroups + * themselves could be < 0 (freezed). In other words, + * there is no guarantee that all refcounts > 0. + */ + if (!erofs_workgroup_try_to_freeze(grp, 1)) + return false; + + /* + * Note that all cached pages should be unattached + * before deleted from the radix tree. Otherwise some + * cached pages could be still attached to the orphan + * old workgroup when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_pages(sbi, grp)) { + erofs_workgroup_unfreeze(grp, 1); + return false; + } + + /* + * It's impossible to fail after the workgroup is freezed, + * however in order to avoid some race conditions, add a + * DBG_BUGON to observe this in advance. + */ + DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, + grp->index)) != grp); + + /* + * If managed cache is on, last refcount should indicate + * the related workstation. + */ + erofs_workgroup_unfreeze_final(grp); + return true; +} + +static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink, + bool cleanup) +{ + pgoff_t first_index = 0; + void *batch[PAGEVEC_SIZE]; + unsigned int freed = 0; + + int i, found; +repeat: + xa_lock(&sbi->workstn_tree); + + found = radix_tree_gang_lookup(&sbi->workstn_tree, + batch, first_index, PAGEVEC_SIZE); + + for (i = 0; i < found; ++i) { + struct erofs_workgroup *grp = xa_untag_pointer(batch[i]); + + first_index = grp->index + 1; + + /* try to shrink each valid workgroup */ + if (!erofs_try_to_release_workgroup(sbi, grp, cleanup)) + continue; + + ++freed; + if (!--nr_shrink) + break; + } + xa_unlock(&sbi->workstn_tree); + + if (i && nr_shrink) + goto repeat; + return freed; +} + +/* protected by 'erofs_sb_list_lock' */ +static unsigned int shrinker_run_no; + +/* protects the mounted 'erofs_sb_list' */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); + +void erofs_shrinker_register(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_shrinker_unregister(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + erofs_shrink_workstation(sbi, ~0UL, true); + + spin_lock(&erofs_sb_list_lock); + list_del(&sbi->list); + spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); +} + +static unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + + freed += erofs_shrink_workstation(sbi, nr, false); + + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + +static struct shrinker erofs_shrinker_info = { + .scan_objects = erofs_shrink_scan, + .count_objects = erofs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +int __init erofs_init_shrinker(void) +{ + return register_shrinker(&erofs_shrinker_info); +} + +void erofs_exit_shrinker(void) +{ + unregister_shrinker(&erofs_shrinker_info); +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c new file mode 100644 index 000000000000..a13a78725c57 --- /dev/null +++ b/fs/erofs/xattr.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include <linux/security.h> +#include "xattr.h" + +struct xattr_iter { + struct super_block *sb; + struct page *page; + void *kaddr; + + erofs_blk_t blkaddr; + unsigned int ofs; +}; + +static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) +{ + /* the only user of kunmap() is 'init_inode_xattrs' */ + if (!atomic) + kunmap(it->page); + else + kunmap_atomic(it->kaddr); + + unlock_page(it->page); + put_page(it->page); +} + +static inline void xattr_iter_end_final(struct xattr_iter *it) +{ + if (!it->page) + return; + + xattr_iter_end(it, true); +} + +static int init_inode_xattrs(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct xattr_iter it; + unsigned int i; + struct erofs_xattr_ibody_header *ih; + struct super_block *sb; + struct erofs_sb_info *sbi; + bool atomic_map; + int ret = 0; + + /* the most case is that xattrs of this inode are initialized. */ + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) + return 0; + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + /* someone has initialized xattrs for us? */ + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) + goto out_unlock; + + /* + * bypass all xattr operations if ->xattr_isize is not greater than + * sizeof(struct erofs_xattr_ibody_header), in detail: + * 1) it is not enough to contain erofs_xattr_ibody_header then + * ->xattr_isize should be 0 (it means no xattr); + * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk + * undefined right now (maybe use later with some new sb feature). + */ + if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { + erofs_err(inode->i_sb, + "xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + ret = -EOPNOTSUPP; + goto out_unlock; + } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { + if (vi->xattr_isize) { + erofs_err(inode->i_sb, + "bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + ret = -EFSCORRUPTED; + goto out_unlock; /* xattr ondisk layout error */ + } + ret = -ENOATTR; + goto out_unlock; + } + + sb = inode->i_sb; + sbi = EROFS_SB(sb); + it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + + it.page = erofs_get_meta_page(sb, it.blkaddr); + if (IS_ERR(it.page)) { + ret = PTR_ERR(it.page); + goto out_unlock; + } + + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = kmap(it.page); + atomic_map = false; + + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, + sizeof(uint), GFP_KERNEL); + if (!vi->xattr_shared_xattrs) { + xattr_iter_end(&it, atomic_map); + ret = -ENOMEM; + goto out_unlock; + } + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (it.ofs >= EROFS_BLKSIZ) { + /* cannot be unaligned */ + DBG_BUGON(it.ofs != EROFS_BLKSIZ); + xattr_iter_end(&it, atomic_map); + + it.page = erofs_get_meta_page(sb, ++it.blkaddr); + if (IS_ERR(it.page)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.page); + goto out_unlock; + } + + it.kaddr = kmap_atomic(it.page); + atomic_map = true; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + xattr_iter_end(&it, atomic_map); + + set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); + +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); + return ret; +} + +/* + * the general idea for these return values is + * if 0 is returned, go on processing the current xattr; + * 1 (> 0) is returned, skip this round to process the next xattr; + * -err (< 0) is returned, an error (maybe ENOXATTR) occurred + * and need to be handled + */ +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); + int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); + int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); + void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); +}; + +static inline int xattr_iter_fixup(struct xattr_iter *it) +{ + if (it->ofs < EROFS_BLKSIZ) + return 0; + + xattr_iter_end(it, true); + + it->blkaddr += erofs_blknr(it->ofs); + + it->page = erofs_get_meta_page(it->sb, it->blkaddr); + if (IS_ERR(it->page)) { + int err = PTR_ERR(it->page); + + it->page = NULL; + return err; + } + + it->kaddr = kmap_atomic(it->page); + it->ofs = erofs_blkoff(it->ofs); + return 0; +} + +static int inline_xattr_iter_begin(struct xattr_iter *it, + struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); + unsigned int xattr_header_sz, inline_xattr_ofs; + + xattr_header_sz = inlinexattr_header_size(inode); + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); + it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); + + it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr); + if (IS_ERR(it->page)) + return PTR_ERR(it->page); + + it->kaddr = kmap_atomic(it->page); + return vi->xattr_isize - xattr_header_sz; +} + +/* + * Regardless of success or failure, `xattr_foreach' will end up with + * `ofs' pointing to the next xattr item rather than an arbitrary position. + */ +static int xattr_foreach(struct xattr_iter *it, + const struct xattr_iter_handlers *op, + unsigned int *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned int value_sz, processed, slice; + int err; + + /* 0. fixup blkaddr, ofs, ipage */ + err = xattr_iter_fixup(it); + if (err) + return err; + + /* + * 1. read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit) { + unsigned int entry_sz = erofs_xattr_entry_size(&entry); + + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (*tlimit < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* xattrs should be 4-byte aligned (on-disk constraint) */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err < 0 ? err : 0; +} + +struct getxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, index; + struct qstr name; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return !it->buffer ? 1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned int processed, + char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static const struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_begin(&it->it, inode); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret != -ENOATTR) + break; + } + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_size; +} + +static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret != -ENOATTR) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_size; +} + +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +int erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct getxattr_iter it; + + if (!name) + return -EINVAL; + + ret = init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + + it.name.len = strlen(name); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + it.name.name = name; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + it.it.sb = inode->i_sb; + ret = inline_getxattr(inode, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(inode, &it); + return ret; +} + +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + switch (handler->flags) { + case EROFS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case EROFS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case EROFS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + + return erofs_getxattr(inode, handler->flags, name, buffer, size); +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, + .list = erofs_xattr_user_list, + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, + .list = erofs_xattr_trusted_list, + .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler *erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + +struct listxattr_iter { + struct xattr_iter it; + + struct dentry *dentry; + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned int prefix_len; + const char *prefix; + + const struct xattr_handler *h = + erofs_xattr_handler(entry->e_name_index); + + if (!h || (h->list && !h->list(it->dentry))) + return 1; + + prefix = xattr_prefix(h); + prefix_len = strlen(prefix); + + if (!it->buffer) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static const struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct listxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret) + break; + } + xattr_iter_end_final(&it->it); + return ret ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct listxattr_iter *it) +{ + struct inode *const inode = d_inode(it->dentry); + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + ret = init_inode_xattrs(d_inode(dentry)); + if (ret) + return ret; + + it.dentry = dentry; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + it.it.sb = dentry->d_sb; + + ret = inline_listxattr(&it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(&it); +} + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + int prefix, rc; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + rc = erofs_getxattr(inode, prefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = erofs_getxattr(inode, prefix, "", value, rc); + } + + if (rc == -ENOATTR) + acl = NULL; + else if (rc < 0) + acl = ERR_PTR(rc); + else + acl = posix_acl_from_xattr(&init_user_ns, value, rc); + kfree(value); + return acl; +} +#endif + diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h new file mode 100644 index 000000000000..3585b84d2f20 --- /dev/null +++ b/fs/erofs/xattr.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_XATTR_H +#define __EROFS_XATTR_H + +#include "internal.h" +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Attribute not found */ +#define ENOATTR ENODATA + +static inline unsigned int inlinexattr_header_size(struct inode *inode) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * EROFS_I(inode)->xattr_shared_count; +} + +static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, + unsigned int xattr_id) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return sbi->xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +#else + return 0; +#endif +} + +static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, + unsigned int xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct xattr_handler erofs_xattr_user_handler; +extern const struct xattr_handler erofs_xattr_trusted_handler; +#ifdef CONFIG_EROFS_FS_SECURITY +extern const struct xattr_handler erofs_xattr_security_handler; +#endif + +static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) +{ +static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif +}; + + return idx && idx < ARRAY_SIZE(xattr_handler_map) ? + xattr_handler_map[idx] : NULL; +} + +extern const struct xattr_handler *erofs_xattr_handlers[]; + +int erofs_getxattr(struct inode *, int, const char *, void *, size_t); +ssize_t erofs_listxattr(struct dentry *, char *, size_t); +#else +static inline int erofs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} + +static inline ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_XATTR */ + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type); +#else +#define erofs_get_acl (NULL) +#endif + +#endif + diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c new file mode 100644 index 000000000000..96e34c90f814 --- /dev/null +++ b/fs/erofs/zdata.c @@ -0,0 +1,1431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "zdata.h" +#include "compress.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +/* + * a compressed_pages[] placeholder in order to avoid + * being filled with file pages for in-place decompression. + */ +#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) + +/* how to allocate cached pages for a pcluster */ +enum z_erofs_cache_alloctype { + DONTALLOC, /* don't allocate any cached pages */ + DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ +}; + +/* + * tagged pointer with 1-bit tag for all compressed pages + * tag 0 - the page is just found with an extra page reference + */ +typedef tagptr1_t compressed_page_t; + +#define tag_compressed_page_justfound(page) \ + tagptr_fold(compressed_page_t, page, 1) + +static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static struct kmem_cache *pcluster_cachep __read_mostly; + +void z_erofs_exit_zip_subsystem(void) +{ + destroy_workqueue(z_erofs_workqueue); + kmem_cache_destroy(pcluster_cachep); +} + +static inline int z_erofs_init_workqueue(void) +{ + const unsigned int onlinecpus = num_possible_cpus(); + const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE; + + /* + * no need to spawn too many threads, limiting threads could minimum + * scheduling overhead, perhaps per-CPU threads should be better? + */ + z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags, + onlinecpus + onlinecpus / 4); + return z_erofs_workqueue ? 0 : -ENOMEM; +} + +static void z_erofs_pcluster_init_once(void *ptr) +{ + struct z_erofs_pcluster *pcl = ptr; + struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); + unsigned int i; + + mutex_init(&cl->lock); + cl->nr_pages = 0; + cl->vcnt = 0; + for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) + pcl->compressed_pages[i] = NULL; +} + +static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl) +{ + struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); + + atomic_set(&pcl->obj.refcount, 1); + + DBG_BUGON(cl->nr_pages); + DBG_BUGON(cl->vcnt); +} + +int __init z_erofs_init_zip_subsystem(void) +{ + pcluster_cachep = kmem_cache_create("erofs_compress", + Z_EROFS_WORKGROUP_SIZE, 0, + SLAB_RECLAIM_ACCOUNT, + z_erofs_pcluster_init_once); + if (pcluster_cachep) { + if (!z_erofs_init_workqueue()) + return 0; + + kmem_cache_destroy(pcluster_cachep); + } + return -ENOMEM; +} + +enum z_erofs_collectmode { + COLLECT_SECONDARY, + COLLECT_PRIMARY, + /* + * The current collection was the tail of an exist chain, in addition + * that the previous processed chained collections are all decided to + * be hooked up to it. + * A new chain will be created for the remaining collections which are + * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, + * the next collection cannot reuse the whole page safely in + * the following scenario: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (belongs to the next cl) | (belongs to the current cl) | + * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + */ + COLLECT_PRIMARY_HOOKED, + COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + /* + * The current collection has been linked with the owned chain, and + * could also be linked with the remaining collections, which means + * if the processing page is the tail page of the collection, thus + * the current collection can safely use the whole page (since + * the previous collection is under control) for in-place I/O, as + * illustrated below: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current cl) | (of the previous collection) | + * | PRIMARY_FOLLOWED or | | + * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * + * [ (*) the above page can be used as inplace I/O. ] + */ + COLLECT_PRIMARY_FOLLOWED, +}; + +struct z_erofs_collector { + struct z_erofs_pagevec_ctor vector; + + struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_collection *cl; + struct page **compressedpages; + z_erofs_next_pcluster_t owned_head; + + enum z_erofs_collectmode mode; +}; + +struct z_erofs_decompress_frontend { + struct inode *const inode; + + struct z_erofs_collector clt; + struct erofs_map_blocks map; + + /* used for applying cache strategy on the fly */ + bool backmost; + erofs_off_t headoffset; +}; + +#define COLLECTOR_INIT() { \ + .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = COLLECT_PRIMARY_FOLLOWED } + +#define DECOMPRESS_FRONTEND_INIT(__i) { \ + .inode = __i, .clt = COLLECTOR_INIT(), \ + .backmost = true, } + +static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; +static DEFINE_MUTEX(z_pagemap_global_lock); + +static void preload_compressed_pages(struct z_erofs_collector *clt, + struct address_space *mc, + enum z_erofs_cache_alloctype type, + struct list_head *pagepool) +{ + const struct z_erofs_pcluster *pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct page **pages = clt->compressedpages; + pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + bool standalone = true; + + if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + return; + + for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + struct page *page; + compressed_page_t t; + + /* the compressed page was loaded before */ + if (READ_ONCE(*pages)) + continue; + + page = find_get_page(mc, index); + + if (page) { + t = tag_compressed_page_justfound(page); + } else if (type == DELAYEDALLOC) { + t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else { /* DONTALLOC */ + if (standalone) + clt->compressedpages = pages; + standalone = false; + continue; + } + + if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + continue; + + if (page) + put_page(page); + } + + if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct address_space *const mapping = MNGD_MAPPING(sbi); + const unsigned int clusterpages = BIT(pcl->clusterbits); + int i; + + /* + * refcount of workgroup is now freezed as 1, + * therefore no need to worry about available decompression users. + */ + for (i = 0; i < clusterpages; ++i) { + struct page *page = pcl->compressed_pages[i]; + + if (!page) + continue; + + /* block other users from reclaiming or migrating the page */ + if (!trylock_page(page)) + return -EBUSY; + + if (page->mapping != mapping) + continue; + + /* barrier is implied in the following 'unlock_page' */ + WRITE_ONCE(pcl->compressed_pages[i], NULL); + set_page_private(page, 0); + ClearPagePrivate(page); + + unlock_page(page); + put_page(page); + } + return 0; +} + +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page) +{ + struct z_erofs_pcluster *const pcl = (void *)page_private(page); + const unsigned int clusterpages = BIT(pcl->clusterbits); + int ret = 0; /* 0 - busy */ + + if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { + unsigned int i; + + for (i = 0; i < clusterpages; ++i) { + if (pcl->compressed_pages[i] == page) { + WRITE_ONCE(pcl->compressed_pages[i], NULL); + ret = 1; + break; + } + } + erofs_workgroup_unfreeze(&pcl->obj, 1); + + if (ret) { + ClearPagePrivate(page); + put_page(page); + } + } + return ret; +} + +/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ +static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, + struct page *page) +{ + struct z_erofs_pcluster *const pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + + while (clt->compressedpages < pcl->compressed_pages + clusterpages) { + if (!cmpxchg(clt->compressedpages++, NULL, page)) + return true; + } + return false; +} + +/* callers must be with collection lock held */ +static int z_erofs_attach_page(struct z_erofs_collector *clt, + struct page *page, + enum z_erofs_page_type type) +{ + int ret; + bool occupied; + + /* give priority for inplaceio */ + if (clt->mode >= COLLECT_PRIMARY && + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && + z_erofs_try_inplace_io(clt, page)) + return 0; + + ret = z_erofs_pagevec_enqueue(&clt->vector, + page, type, &occupied); + clt->cl->vcnt += (unsigned int)ret; + + return ret ? 0 : -EAGAIN; +} + +static enum z_erofs_collectmode +try_to_claim_pcluster(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t *owned_head) +{ + /* let's claim these following types of pclusters */ +retry: + if (pcl->next == Z_EROFS_PCLUSTER_NIL) { + /* type 1, nil pcluster */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, + *owned_head) != Z_EROFS_PCLUSTER_NIL) + goto retry; + + *owned_head = &pcl->next; + /* lucky, I am the followee :) */ + return COLLECT_PRIMARY_FOLLOWED; + } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) { + /* + * type 2, link to the end of a existing open chain, + * be careful that its submission itself is governed + * by the original owned chain. + */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) != Z_EROFS_PCLUSTER_TAIL) + goto retry; + *owned_head = Z_EROFS_PCLUSTER_TAIL; + return COLLECT_PRIMARY_HOOKED; + } + return COLLECT_PRIMARY; /* :( better luck next time */ +} + +static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_workgroup *grp; + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + unsigned int length; + bool tag; + + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); + if (!grp) + return NULL; + + pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + + cl = z_erofs_primarycollection(pcl); + if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + + length = READ_ONCE(pcl->length); + if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { + if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + } else { + unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; + + if (map->m_flags & EROFS_MAP_FULL_MAPPED) + llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; + + while (llen > length && + length != cmpxchg_relaxed(&pcl->length, length, llen)) { + cpu_relax(); + length = READ_ONCE(pcl->length); + } + } + mutex_lock(&cl->lock); + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); + /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = NULL; + clt->pcl = pcl; + clt->cl = cl; + return cl; +} + +static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + int err; + + /* no available workgroup, let's allocate one */ + pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + + z_erofs_pcluster_init_always(pcl); + pcl->obj.index = map->m_pa >> PAGE_SHIFT; + + pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | + (map->m_flags & EROFS_MAP_FULL_MAPPED ? + Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + + if (map->m_flags & EROFS_MAP_ZIPPED) + pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; + else + pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + + pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; + pcl->clusterbits -= PAGE_SHIFT; + + /* new pclusters should be claimed as type 1, primary and followed */ + pcl->next = clt->owned_head; + clt->mode = COLLECT_PRIMARY_FOLLOWED; + + cl = z_erofs_primarycollection(pcl); + cl->pageofs = map->m_la & ~PAGE_MASK; + + /* + * lock all primary followed works before visible to others + * and mutex_trylock *never* fails for a new pcluster. + */ + mutex_trylock(&cl->lock); + + err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0); + if (err) { + mutex_unlock(&cl->lock); + kmem_cache_free(pcluster_cachep, pcl); + return ERR_PTR(-EAGAIN); + } + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->owned_head = &pcl->next; + clt->pcl = pcl; + clt->cl = cl; + return cl; +} + +static int z_erofs_collector_begin(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_collection *cl; + + DBG_BUGON(clt->cl); + + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + + if (!PAGE_ALIGNED(map->m_pa)) { + DBG_BUGON(1); + return -EINVAL; + } + +repeat: + cl = cllookup(clt, inode, map); + if (!cl) { + cl = clregister(clt, inode, map); + + if (cl == ERR_PTR(-EAGAIN)) + goto repeat; + } + + if (IS_ERR(cl)) + return PTR_ERR(cl); + + z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, cl->vcnt); + + clt->compressedpages = clt->pcl->compressed_pages; + if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ + clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + return 0; +} + +/* + * keep in mind that no referenced pclusters will be freed + * only after a RCU grace period. + */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + struct z_erofs_collection *const cl = + container_of(head, struct z_erofs_collection, rcu); + + kmem_cache_free(pcluster_cachep, + container_of(cl, struct z_erofs_pcluster, + primary_collection)); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); + + call_rcu(&cl->rcu, z_erofs_rcu_callback); +} + +static void z_erofs_collection_put(struct z_erofs_collection *cl) +{ + struct z_erofs_pcluster *const pcl = + container_of(cl, struct z_erofs_pcluster, primary_collection); + + erofs_workgroup_put(&pcl->obj); +} + +static bool z_erofs_collector_end(struct z_erofs_collector *clt) +{ + struct z_erofs_collection *cl = clt->cl; + + if (!cl) + return false; + + z_erofs_pagevec_ctor_exit(&clt->vector, false); + mutex_unlock(&cl->lock); + + /* + * if all pending pages are added, don't hold its reference + * any longer if the pcluster isn't hosted by ourselves. + */ + if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + z_erofs_collection_put(cl); + + clt->cl = NULL; + return true; +} + +static inline struct page *__stagingpage_alloc(struct list_head *pagepool, + gfp_t gfp) +{ + struct page *page = erofs_allocpage(pagepool, gfp, true); + + page->mapping = Z_EROFS_MAPPING_STAGING; + return page; +} + +static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, + unsigned int cachestrategy, + erofs_off_t la) +{ + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + la < fe->headoffset; +} + +static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, + struct page *page, + struct list_head *pagepool) +{ + struct inode *const inode = fe->inode; + struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode); + struct erofs_map_blocks *const map = &fe->map; + struct z_erofs_collector *const clt = &fe->clt; + const loff_t offset = page_offset(page); + bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + + enum z_erofs_cache_alloctype cache_strategy; + enum z_erofs_page_type page_type; + unsigned int cur, end, spiltted, index; + int err = 0; + + /* register locked file pages as online pages in pack */ + z_erofs_onlinepage_init(page); + + spiltted = 0; + end = PAGE_SIZE; +repeat: + cur = end - 1; + + /* lucky, within the range of the current map_blocks */ + if (offset + cur >= map->m_la && + offset + cur < map->m_la + map->m_llen) { + /* didn't get a valid collection previously (very rare) */ + if (!clt->cl) + goto restart_now; + goto hitted; + } + + /* go ahead the next map_blocks */ + erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); + + if (z_erofs_collector_end(clt)) + fe->backmost = false; + + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + +restart_now: + if (!(map->m_flags & EROFS_MAP_MAPPED)) + goto hitted; + + err = z_erofs_collector_begin(clt, inode, map); + if (err) + goto err_out; + + /* preload all compressed pages (maybe downgrade role if necessary) */ + if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) + cache_strategy = DELAYEDALLOC; + else + cache_strategy = DONTALLOC; + + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); + + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); +hitted: + cur = end - min_t(unsigned int, offset + end - map->m_la, end); + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, end); + goto next_part; + } + + /* let's derive page type */ + page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : + (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + + if (cur) + tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); + +retry: + err = z_erofs_attach_page(clt, page, page_type); + /* should allocate an additional staging page for pagevec */ + if (err == -EAGAIN) { + struct page *const newpage = + __stagingpage_alloc(pagepool, GFP_NOFS); + + err = z_erofs_attach_page(clt, newpage, + Z_EROFS_PAGE_TYPE_EXCLUSIVE); + if (!err) + goto retry; + } + + if (err) + goto err_out; + + index = page->index - (map->m_la >> PAGE_SHIFT); + + z_erofs_onlinepage_fixup(page, index, true); + + /* bump up the number of spiltted parts of a page */ + ++spiltted; + /* also update nr_pages */ + clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); +next_part: + /* can be used for verification */ + map->m_llen = offset + cur - map->m_la; + + end = cur; + if (end > 0) + goto repeat; + +out: + z_erofs_onlinepage_endio(page); + + erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", + __func__, page, spiltted, map->m_llen); + return err; + + /* if some error occurred while processing this page */ +err_out: + SetPageError(page); + goto out; +} + +static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +{ + tagptr1_t t = tagptr_init(tagptr1_t, ptr); + struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t); + bool background = tagptr_unfold_tags(t); + + if (!background) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (!atomic_add_return(bios, &io->pending_bios)) + queue_work(z_erofs_workqueue, &io->u.work); +} + +static inline void z_erofs_vle_read_endio(struct bio *bio) +{ + struct erofs_sb_info *sbi = NULL; + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + bool cachemngd = false; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(!page->mapping); + + if (!sbi && !z_erofs_page_is_staging(page)) + sbi = EROFS_SB(page->mapping->host->i_sb); + + /* sbi should already be gotten if the page is managed */ + if (sbi) + cachemngd = erofs_page_is_managed(sbi, page); + + if (err) + SetPageError(page); + else if (cachemngd) + SetPageUptodate(page); + + if (cachemngd) + unlock_page(page); + } + + z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + bio_put(bio); +} + +static int z_erofs_decompress_pcluster(struct super_block *sb, + struct z_erofs_pcluster *pcl, + struct list_head *pagepool) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct z_erofs_pagevec_ctor ctor; + unsigned int i, outputsize, llen, nr_pages; + struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; + struct page **pages, **compressed_pages, *page; + + enum z_erofs_page_type page_type; + bool overlapped, partial; + struct z_erofs_collection *cl; + int err; + + might_sleep(); + cl = z_erofs_primarycollection(pcl); + DBG_BUGON(!READ_ONCE(cl->nr_pages)); + + mutex_lock(&cl->lock); + nr_pages = cl->nr_pages; + + if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { + pages = pages_onstack; + } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && + mutex_trylock(&z_pagemap_global_lock)) { + pages = z_pagemap_global; + } else { + gfp_t gfp_flags = GFP_KERNEL; + + if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) + gfp_flags |= __GFP_NOFAIL; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), + gfp_flags); + + /* fallback to global pagemap for the lowmem scenario */ + if (!pages) { + mutex_lock(&z_pagemap_global_lock); + pages = z_pagemap_global; + } + } + + for (i = 0; i < nr_pages; ++i) + pages[i] = NULL; + + err = 0; + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, 0); + + for (i = 0; i < cl->vcnt; ++i) { + unsigned int pagenr; + + page = z_erofs_pagevec_dequeue(&ctor, &page_type); + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + + /* + * currently EROFS doesn't support multiref(dedup), + * so here erroring out one multiref page. + */ + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + } + z_erofs_pagevec_ctor_exit(&ctor, true); + + overlapped = false; + compressed_pages = pcl->compressed_pages; + + for (i = 0; i < clusterpages; ++i) { + unsigned int pagenr; + + page = compressed_pages[i]; + + /* all compressed pages ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (!z_erofs_page_is_staging(page)) { + if (erofs_page_is_managed(sbi, page)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + /* + * only if non-head page can be selected + * for inplace decompression + */ + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + + overlapped = true; + } + + /* PG_error needs checking for inplaced and staging pages */ + if (PageError(page)) { + DBG_BUGON(PageUptodate(page)); + err = -EIO; + } + } + + if (err) + goto out; + + llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; + if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + outputsize = llen; + partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); + } else { + outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + partial = true; + } + + err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + .sb = sb, + .in = compressed_pages, + .out = pages, + .pageofs_out = cl->pageofs, + .inputsize = PAGE_SIZE, + .outputsize = outputsize, + .alg = pcl->algorithmformat, + .inplace_io = overlapped, + .partial_decoding = partial + }, pagepool); + +out: + /* must handle all compressed pages before endding pages */ + for (i = 0; i < clusterpages; ++i) { + page = compressed_pages[i]; + + if (erofs_page_is_managed(sbi, page)) + continue; + + /* recycle all individual staging pages */ + (void)z_erofs_put_stagingpage(pagepool, page); + + WRITE_ONCE(compressed_pages[i], NULL); + } + + for (i = 0; i < nr_pages; ++i) { + page = pages[i]; + if (!page) + continue; + + DBG_BUGON(!page->mapping); + + /* recycle all individual staging pages */ + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (err < 0) + SetPageError(page); + + z_erofs_onlinepage_endio(page); + } + + if (pages == z_pagemap_global) + mutex_unlock(&z_pagemap_global_lock); + else if (pages != pages_onstack) + kvfree(pages); + + cl->nr_pages = 0; + cl->vcnt = 0; + + /* all cl locks MUST be taken before the following line */ + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + + /* all cl locks SHOULD be released right now */ + mutex_unlock(&cl->lock); + + z_erofs_collection_put(cl); + return err; +} + +static void z_erofs_vle_unzip_all(struct super_block *sb, + struct z_erofs_unzip_io *io, + struct list_head *pagepool) +{ + z_erofs_next_pcluster_t owned = io->head; + + while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { + struct z_erofs_pcluster *pcl; + + /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); + + /* no possible that 'owned' equals NULL */ + DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); + + pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(pcl->next); + + z_erofs_decompress_pcluster(sb, pcl, pagepool); + } +} + +static void z_erofs_vle_unzip_wq(struct work_struct *work) +{ + struct z_erofs_unzip_io_sb *iosb = + container_of(work, struct z_erofs_unzip_io_sb, io.u.work); + LIST_HEAD(pagepool); + + DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool); + + put_pages_list(&pagepool); + kvfree(iosb); +} + +static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, + unsigned int nr, + struct list_head *pagepool, + struct address_space *mc, + gfp_t gfp) +{ + /* determined at compile time to avoid too many #ifdefs */ + const bool nocache = __builtin_constant_p(mc) ? !mc : false; + const pgoff_t index = pcl->obj.index; + bool tocache = false; + + struct address_space *mapping; + struct page *oldpage, *page; + + compressed_page_t t; + int justfound; + +repeat: + page = READ_ONCE(pcl->compressed_pages[nr]); + oldpage = page; + + if (!page) + goto out_allocpage; + + /* + * the cached page has not been allocated and + * an placeholder is out there, prepare it now. + */ + if (!nocache && page == PAGE_UNALLOCATED) { + tocache = true; + goto out_allocpage; + } + + /* process the target tagged pointer */ + t = tagptr_init(compressed_page_t, page); + justfound = tagptr_unfold_tags(t); + page = tagptr_unfold_ptr(t); + + mapping = READ_ONCE(page->mapping); + + /* + * if managed cache is disabled, it's no way to + * get such a cached-like page. + */ + if (nocache) { + /* if managed cache is disabled, it is impossible `justfound' */ + DBG_BUGON(justfound); + + /* and it should be locked, not uptodate, and not truncated */ + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(!mapping); + goto out; + } + + /* + * unmanaged (file) pages are all locked solidly, + * therefore it is impossible for `mapping' to be NULL. + */ + if (mapping && mapping != mc) + /* ought to be unmanaged pages */ + goto out; + + lock_page(page); + + /* only true if page reclaim goes wrong, should never happen */ + DBG_BUGON(justfound && PagePrivate(page)); + + /* the page is still in manage cache */ + if (page->mapping == mc) { + WRITE_ONCE(pcl->compressed_pages[nr], page); + + ClearPageError(page); + if (!PagePrivate(page)) { + /* + * impossible to be !PagePrivate(page) for + * the current restriction as well if + * the page is already in compressed_pages[]. + */ + DBG_BUGON(!justfound); + + justfound = 0; + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } + + /* no need to submit io if it is already up-to-date */ + if (PageUptodate(page)) { + unlock_page(page); + page = NULL; + } + goto out; + } + + /* + * the managed page has been truncated, it's unsafe to + * reuse this one, let's allocate a new cache-managed page. + */ + DBG_BUGON(page->mapping); + DBG_BUGON(!justfound); + + tocache = true; + unlock_page(page); + put_page(page); +out_allocpage: + page = __stagingpage_alloc(pagepool, gfp); + if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + list_add(&page->lru, pagepool); + cpu_relax(); + goto repeat; + } + if (nocache || !tocache) + goto out; + if (add_to_page_cache_lru(page, mc, index + nr, gfp)) { + page->mapping = Z_EROFS_MAPPING_STAGING; + goto out; + } + + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); +out: /* the only exit (for tracing and debugging) */ + return page; +} + +static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, + struct z_erofs_unzip_io *io, + bool foreground) +{ + struct z_erofs_unzip_io_sb *iosb; + + if (foreground) { + /* waitqueue available for foreground io */ + DBG_BUGON(!io); + + init_waitqueue_head(&io->u.wait); + atomic_set(&io->pending_bios, 0); + goto out; + } + + iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL); + DBG_BUGON(!iosb); + + /* initialize fields in the allocated descriptor */ + io = &iosb->io; + iosb->sb = sb; + INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); +out: + io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + return io; +} + +/* define decompression jobqueue types */ +enum { + JQ_BYPASS, + JQ_SUBMIT, + NR_JOBQUEUES, +}; + +static void *jobqueueset_init(struct super_block *sb, + z_erofs_next_pcluster_t qtail[], + struct z_erofs_unzip_io *q[], + struct z_erofs_unzip_io *fgq, + bool forcefg) +{ + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + + return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); +} + +static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t qtail[], + z_erofs_next_pcluster_t owned_head) +{ + z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; + z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; + + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (owned_head == Z_EROFS_PCLUSTER_TAIL) + owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + + WRITE_ONCE(*submit_qtail, owned_head); + WRITE_ONCE(*bypass_qtail, &pcl->next); + + qtail[JQ_BYPASS] = &pcl->next; +} + +static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], + unsigned int nr_bios, + bool force_fg) +{ + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. + */ + if (force_fg || nr_bios) + return false; + + kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io)); + return true; +} + +static bool z_erofs_vle_submit_all(struct super_block *sb, + z_erofs_next_pcluster_t owned_head, + struct list_head *pagepool, + struct z_erofs_unzip_io *fgq, + bool force_fg) +{ + struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); + z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_unzip_io *q[NR_JOBQUEUES]; + struct bio *bio; + void *bi_private; + /* since bio will be NULL, no need to initialize last_index */ + pgoff_t uninitialized_var(last_index); + bool force_submit = false; + unsigned int nr_bios; + + if (owned_head == Z_EROFS_PCLUSTER_TAIL) + return false; + + force_submit = false; + bio = NULL; + nr_bios = 0; + bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg); + + /* by default, all need io submission */ + q[JQ_SUBMIT]->head = owned_head; + + do { + struct z_erofs_pcluster *pcl; + unsigned int clusterpages; + pgoff_t first_index; + struct page *page; + unsigned int i = 0, bypass = 0; + int err; + + /* no possible 'owned_head' equals the following */ + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); + + pcl = container_of(owned_head, struct z_erofs_pcluster, next); + + clusterpages = BIT(pcl->clusterbits); + + /* close the main owned chain at first */ + owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + Z_EROFS_PCLUSTER_TAIL_CLOSED); + + first_index = pcl->obj.index; + force_submit |= (first_index != last_index + 1); + +repeat: + page = pickup_page_for_submission(pcl, i, pagepool, + MNGD_MAPPING(sbi), + GFP_NOFS); + if (!page) { + force_submit = true; + ++bypass; + goto skippage; + } + + if (bio && force_submit) { +submit_bio_retry: + submit_bio(bio); + bio = NULL; + } + + if (!bio) { + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + + bio->bi_end_io = z_erofs_vle_read_endio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)(first_index + i) << + LOG_SECTORS_PER_BLOCK; + bio->bi_private = bi_private; + bio->bi_opf = REQ_OP_READ; + + ++nr_bios; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (err < PAGE_SIZE) + goto submit_bio_retry; + + force_submit = false; + last_index = first_index + i; +skippage: + if (++i < clusterpages) + goto repeat; + + if (bypass < clusterpages) + qtail[JQ_SUBMIT] = &pcl->next; + else + move_to_bypass_jobqueue(pcl, qtail, owned_head); + } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + + if (bio) + submit_bio(bio); + + if (postsubmit_is_all_bypassed(q, nr_bios, force_fg)) + return true; + + z_erofs_vle_unzip_kickoff(bi_private, nr_bios); + return true; +} + +static void z_erofs_submit_and_unzip(struct super_block *sb, + struct z_erofs_collector *clt, + struct list_head *pagepool, + bool force_fg) +{ + struct z_erofs_unzip_io io[NR_JOBQUEUES]; + + if (!z_erofs_vle_submit_all(sb, clt->owned_head, + pagepool, io, force_fg)) + return; + + /* decompress no I/O pclusters immediately */ + z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool); + + if (!force_fg) + return; + + /* wait until all bios are completed */ + wait_event(io[JQ_SUBMIT].u.wait, + !atomic_read(&io[JQ_SUBMIT].pending_bios)); + + /* let's synchronous decompression */ + z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool); +} + +static int z_erofs_vle_normalaccess_readpage(struct file *file, + struct page *page) +{ + struct inode *const inode = page->mapping->host; + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + int err; + LIST_HEAD(pagepool); + + trace_erofs_readpage(page, false); + + f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + + err = z_erofs_do_read_page(&f, page, &pagepool); + (void)z_erofs_collector_end(&f.clt); + + /* if some compressed cluster ready, need submit them anyway */ + z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true); + + if (err) + erofs_err(inode->i_sb, "failed to read, err [%d]", err); + + if (f.map.mpage) + put_page(f.map.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return err; +} + +static bool should_decompress_synchronously(struct erofs_sb_info *sbi, + unsigned int nr) +{ + return nr <= sbi->max_sync_decompress_pages; +} + +static int z_erofs_vle_normalaccess_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, + unsigned int nr_pages) +{ + struct inode *const inode = mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + bool sync = should_decompress_synchronously(sbi, nr_pages); + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + struct page *head = NULL; + LIST_HEAD(pagepool); + + trace_erofs_readpages(mapping->host, lru_to_page(pages), + nr_pages, false); + + f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; + + for (; nr_pages; --nr_pages) { + struct page *page = lru_to_page(pages); + + prefetchw(&page->flags); + list_del(&page->lru); + + /* + * A pure asynchronous readahead is indicated if + * a PG_readahead marked page is hitted at first. + * Let's also do asynchronous decompression for this case. + */ + sync &= !(PageReadahead(page) && !head); + + if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { + list_add(&page->lru, &pagepool); + continue; + } + + set_page_private(page, (unsigned long)head); + head = page; + } + + while (head) { + struct page *page = head; + int err; + + /* traversal in reverse order */ + head = (void *)page_private(page); + + err = z_erofs_do_read_page(&f, page, &pagepool); + if (err) + erofs_err(inode->i_sb, + "readahead error at page %lu @ nid %llu", + page->index, EROFS_I(inode)->nid); + put_page(page); + } + + (void)z_erofs_collector_end(&f.clt); + + z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync); + + if (f.map.mpage) + put_page(f.map.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return 0; +} + +const struct address_space_operations z_erofs_vle_normalaccess_aops = { + .readpage = z_erofs_vle_normalaccess_readpage, + .readpages = z_erofs_vle_normalaccess_readpages, +}; + diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h new file mode 100644 index 000000000000..faf950189bd7 --- /dev/null +++ b/fs/erofs/zdata.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_ZDATA_H +#define __EROFS_FS_ZDATA_H + +#include "internal.h" +#include "zpvec.h" + +#define Z_EROFS_NR_INLINE_PAGEVECS 3 + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by pageset lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_collection { + struct mutex lock; + + /* I: page offset of start position of decompression */ + unsigned short pageofs; + + /* L: maximum relative page index in pagevec[] */ + unsigned short nr_pages; + + /* L: total number of pages in pagevec[] */ + unsigned int vcnt; + + union { + /* L: inline a certain number of pagevecs for bootstrap */ + erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; +}; + +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct z_erofs_collection primary_collection; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: compressed pages (including multi-usage pages) */ + struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + /* I: bit shift of physical cluster size */ + unsigned char clusterbits; +}; + +#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) + +struct z_erofs_unzip_io { + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + wait_queue_head_t wait; + struct work_struct work; + } u; +}; + +struct z_erofs_unzip_io_sb { + struct z_erofs_unzip_io io; + struct super_block *sb; +}; + +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + +#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 +#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) +#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) + +/* + * waiters (aka. ongoing_packs): # to unlock the page + * sub-index: 0 - for partial page, >= 1 full page sub-index + */ +typedef atomic_t z_erofs_onlinepage_t; + +/* type punning */ +union z_erofs_onlinepage_converter { + z_erofs_onlinepage_t *o; + unsigned long *v; +}; + +static inline unsigned int z_erofs_onlinepage_index(struct page *page) +{ + union z_erofs_onlinepage_converter u; + + DBG_BUGON(!PagePrivate(page)); + u.v = &page_private(page); + + return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; +} + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + z_erofs_onlinepage_t o; + unsigned long v; + /* keep from being unlocked in advance */ + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_fixup(struct page *page, + uintptr_t index, bool down) +{ + unsigned long *p, o, v, id; +repeat: + p = &page_private(page); + o = READ_ONCE(*p); + + id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (id) { + if (!index) + return; + + DBG_BUGON(id != index); + } + + v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); + if (cmpxchg(p, o, v) != o) + goto repeat; +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + union z_erofs_onlinepage_converter u; + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + u.v = &page_private(page); + + v = atomic_dec_return(u.o); + if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + ClearPagePrivate(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); +} + +#define Z_EROFS_VMAP_ONSTACK_PAGES \ + min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 + +#endif + diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c new file mode 100644 index 000000000000..6a26c293ae2d --- /dev/null +++ b/fs/erofs/zmap.c @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018-2019 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <asm/unaligned.h> +#include <trace/events/erofs.h> + +int z_erofs_fill_inode(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + vi->z_advise = 0; + vi->z_algorithmtype[0] = 0; + vi->z_algorithmtype[1] = 0; + vi->z_logical_clusterbits = LOG_BLOCK_SIZE; + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); + } + + inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops; + return 0; +} + +static int fill_inode_lazy(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + int err; + erofs_off_t pos; + struct page *page; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + return 0; + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, 8); + page = erofs_get_meta_page(sb, erofs_blknr(pos)); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + kaddr = kmap_atomic(page); + + h = kaddr + erofs_blkoff(pos); + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", + vi->z_algorithmtype[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 3) & 3); + + if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { + erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel", + vi->z_physical_clusterbits[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 5) & 7); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +unmap_done: + kunmap_atomic(kaddr); + unlock_page(page); + put_page(page); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + +struct z_erofs_maprecorder { + struct inode *inode; + struct erofs_map_blocks *map; + void *kaddr; + + unsigned long lcn; + /* compression extent information gathered */ + u8 type; + u16 clusterofs; + u16 delta[2]; + erofs_blk_t pblk; +}; + +static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, + erofs_blk_t eblk) +{ + struct super_block *const sb = m->inode->i_sb; + struct erofs_map_blocks *const map = m->map; + struct page *mpage = map->mpage; + + if (mpage) { + if (mpage->index == eblk) { + if (!m->kaddr) + m->kaddr = kmap_atomic(mpage); + return 0; + } + + if (m->kaddr) { + kunmap_atomic(m->kaddr); + m->kaddr = NULL; + } + put_page(mpage); + } + + mpage = erofs_get_meta_page(sb, eblk); + if (IS_ERR(mpage)) { + map->mpage = NULL; + return PTR_ERR(mpage); + } + m->kaddr = kmap_atomic(mpage); + unlock_page(mpage); + map->mpage = mpage; + return 0; +} + +static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); + const erofs_off_t pos = + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_vle_decompressed_index); + struct z_erofs_vle_decompressed_index *di; + unsigned int advise, type; + int err; + + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + + m->lcn = lcn; + di = m->kaddr + erofs_blkoff(pos); + + advise = le16_to_cpu(di->di_advise); + type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); + switch (type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + m->clusterofs = 1 << vi->z_logical_clusterbits; + m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + m->delta[1] = le16_to_cpu(di->di_u.delta[1]); + break; + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + m->clusterofs = le16_to_cpu(di->di_clusterofs); + m->pblk = le32_to_cpu(di->di_u.blkaddr); + break; + default: + DBG_BUGON(1); + return -EOPNOTSUPP; + } + m->type = type; + return 0; +} + +static unsigned int decode_compactedbits(unsigned int lobits, + unsigned int lomask, + u8 *in, unsigned int pos, u8 *type) +{ + const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); + const unsigned int lo = v & lomask; + + *type = (v >> lobits) & 3; + return lo; +} + +static int unpack_compacted_index(struct z_erofs_maprecorder *m, + unsigned int amortizedshift, + unsigned int eofs) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int vcnt, base, lo, encodebits, nblk; + int i; + u8 *in, type; + + if (1 << amortizedshift == 4) + vcnt = 2; + else if (1 << amortizedshift == 2 && lclusterbits == 12) + vcnt = 16; + else + return -EOPNOTSUPP; + + encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + base = round_down(eofs, vcnt << amortizedshift); + in = m->kaddr + base; + + i = (eofs - base) >> amortizedshift; + + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + m->type = type; + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + m->clusterofs = 1 << lclusterbits; + if (i + 1 != vcnt) { + m->delta[0] = lo; + return 0; + } + /* + * since the last lcluster in the pack is special, + * of which lo saves delta[1] rather than delta[0]. + * Hence, get delta[0] by the previous lcluster indirectly. + */ + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * (i - 1), &type); + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + lo = 0; + m->delta[0] = lo + 1; + return 0; + } + m->clusterofs = lo; + m->delta[0] = 0; + /* figout out blkaddr (pblk) for HEAD lclusters */ + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= lo; + + if (i >= 0) + ++nblk; + } + in += (vcnt << amortizedshift) - sizeof(__le32); + m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; + return 0; +} + +static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + + vi->inode_isize + vi->xattr_isize, 8) + + sizeof(struct z_erofs_map_header); + const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + unsigned int compacted_4b_initial, compacted_2b; + unsigned int amortizedshift; + erofs_off_t pos; + int err; + + if (lclusterbits != 12) + return -EOPNOTSUPP; + + if (lcn >= totalidx) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = (32 - ebase % 32) / 4; + if (compacted_4b_initial == 32 / 4) + compacted_4b_initial = 0; + + if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + else + compacted_2b = 0; + + pos = ebase; + if (lcn < compacted_4b_initial) { + amortizedshift = 2; + goto out; + } + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + + if (lcn < compacted_2b) { + amortizedshift = 1; + goto out; + } + pos += compacted_2b * 2; + lcn -= compacted_2b; + amortizedshift = 2; +out: + pos += lcn * (1 << amortizedshift); + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); +} + +static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn) +{ + const unsigned int datamode = EROFS_I(m->inode)->datalayout; + + if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + return vle_legacy_load_cluster_from_disk(m, lcn); + + if (datamode == EROFS_INODE_FLAT_COMPRESSION) + return compacted_load_cluster_from_disk(m, lcn); + + return -EINVAL; +} + +static int vle_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn = m->lcn; + int err; + + if (lcn < lookback_distance) { + erofs_err(m->inode->i_sb, + "bogus lookback distance @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + /* load extent head logical cluster if needed */ + lcn -= lookback_distance; + err = vle_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (!m->delta[0]) { + erofs_err(m->inode->i_sb, + "invalid lookback distance 0 @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return vle_extent_lookback(m, m->delta[0]); + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + map->m_la = (lcn << lclusterbits) | m->clusterofs; + break; + default: + erofs_err(m->inode->i_sb, + "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + return 0; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct z_erofs_maprecorder m = { + .inode = inode, + .map = map, + }; + int err = 0; + unsigned int lclusterbits, endoff; + unsigned long long ofs, end; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (map->m_la >= inode->i_size) { + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + err = fill_inode_lazy(inode); + if (err) + goto out; + + lclusterbits = vi->z_logical_clusterbits; + ofs = map->m_la; + m.lcn = ofs >> lclusterbits; + endoff = ofs & ((1 << lclusterbits) - 1); + + err = vle_load_cluster_from_disk(&m, m.lcn); + if (err) + goto unmap_out; + + map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + end = (m.lcn + 1ULL) << lclusterbits; + + switch (m.type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + if (endoff >= m.clusterofs) + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + if (endoff >= m.clusterofs) { + map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + break; + } + /* m.lcn should be >= 1 if endoff < m.clusterofs */ + if (!m.lcn) { + erofs_err(inode->i_sb, + "invalid logical cluster 0 at nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } + end = (m.lcn << lclusterbits) | m.clusterofs; + map->m_flags |= EROFS_MAP_FULL_MAPPED; + m.delta[0] = 1; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + /* get the correspoinding first chunk */ + err = vle_extent_lookback(&m, m.delta[0]); + if (err) + goto unmap_out; + break; + default: + erofs_err(inode->i_sb, + "unknown type %u @ offset %llu of nid %llu", + m.type, ofs, vi->nid); + err = -EOPNOTSUPP; + goto unmap_out; + } + + map->m_llen = end - map->m_la; + map->m_plen = 1 << lclusterbits; + map->m_pa = blknr_to_addr(m.pblk); + map->m_flags |= EROFS_MAP_MAPPED; + +unmap_out: + if (m.kaddr) + kunmap_atomic(m.kaddr); + +out: + erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", + __func__, map->m_la, map->m_pa, + map->m_llen, map->m_plen, map->m_flags); + + trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + + /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ + DBG_BUGON(err < 0 && err != -ENOMEM); + return err; +} + diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h new file mode 100644 index 000000000000..58556903aa94 --- /dev/null +++ b/fs/erofs/zpvec.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_ZPVEC_H +#define __EROFS_FS_ZPVEC_H + +#include "tagptr.h" + +/* page type in pagevec for decompress subsystem */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + +extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") + __bad_page_type_exclusive(void); + +/* pagevec tagged pointer */ +typedef tagptr2_t erofs_vtptr_t; + +/* pagevec collector */ +struct z_erofs_pagevec_ctor { + struct page *curr, *next; + erofs_vtptr_t *pages; + + unsigned int nr, index; +}; + +static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + if (!ctor->curr) + return; + + if (atomic) + kunmap_atomic(ctor->pages); + else + kunmap(ctor->curr); +} + +static inline struct page * +z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, + unsigned int nr) +{ + unsigned int index; + + /* keep away from occupied pages */ + if (ctor->next) + return ctor->next; + + for (index = 0; index < nr; ++index) { + const erofs_vtptr_t t = ctor->pages[index]; + const unsigned int tags = tagptr_unfold_tags(t); + + if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) + return tagptr_unfold_ptr(t); + } + DBG_BUGON(nr >= ctor->nr); + return NULL; +} + +static inline void +z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); + + z_erofs_pagevec_ctor_exit(ctor, atomic); + + ctor->curr = next; + ctor->next = NULL; + ctor->pages = atomic ? + kmap_atomic(ctor->curr) : kmap(ctor->curr); + + ctor->nr = PAGE_SIZE / sizeof(struct page *); + ctor->index = 0; +} + +static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, + unsigned int nr, + erofs_vtptr_t *pages, + unsigned int i) +{ + ctor->nr = nr; + ctor->curr = ctor->next = NULL; + ctor->pages = pages; + + if (i >= nr) { + i -= nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + while (i > ctor->nr) { + i -= ctor->nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + } + } + ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); + ctor->index = i; +} + +static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, + struct page *page, + enum z_erofs_page_type type, + bool *occupied) +{ + *occupied = false; + if (!ctor->next && type) + if (ctor->index + 1 == ctor->nr) + return false; + + if (ctor->index >= ctor->nr) + z_erofs_pagevec_ctor_pagedown(ctor, false); + + /* exclusive page type must be 0 */ + if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) + __bad_page_type_exclusive(); + + /* should remind that collector->next never equal to 1, 2 */ + if (type == (uintptr_t)ctor->next) { + ctor->next = page; + *occupied = true; + } + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); + return true; +} + +static inline struct page * +z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, + enum z_erofs_page_type *type) +{ + erofs_vtptr_t t; + + if (ctor->index >= ctor->nr) { + DBG_BUGON(!ctor->next); + z_erofs_pagevec_ctor_pagedown(ctor, true); + } + + t = ctor->pages[ctor->index]; + + *type = tagptr_unfold_tags(t); + + /* should remind that collector->next never equal to 1, 2 */ + if (*type == (uintptr_t)ctor->next) + ctor->next = tagptr_unfold_ptr(t); + + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); + return tagptr_unfold_ptr(t); +} +#endif + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d7f1f5011fac..c4159bcc05d9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1459,13 +1459,13 @@ static int ep_create_wakeup_source(struct epitem *epi) struct wakeup_source *ws; if (!epi->ep->ws) { - epi->ep->ws = wakeup_source_register("eventpoll"); + epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } name = epi->ffd.file->f_path.dentry->d_name.name; - ws = wakeup_source_register(name); + ws = wakeup_source_register(NULL, name); if (!ws) return -ENOMEM; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index f0e549783caf..09bc68708d28 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -7,7 +7,7 @@ * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here - * take a look at Documentation/filesystems/nfs/Exporting. + * take a look at Documentation/filesystems/nfs/exporting.rst. */ #include <linux/exportfs.h> #include <linux/fs.h> diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 44eb6e7eb492..baa36c6fb71e 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1002,6 +1002,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits); sb->s_max_links = EXT2_LINK_MAX; + sb->s_time_min = S32_MIN; + sb->s_time_max = S32_MAX; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) { sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8fdfcd3c3e04..b17ddc229ac5 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -13,3 +13,4 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..42c6e4a5e673 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -41,6 +41,7 @@ #endif #include <linux/fscrypt.h> +#include <linux/fsverity.h> #include <linux/compiler.h> @@ -395,6 +396,7 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ @@ -402,7 +404,7 @@ struct flex_groups { #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */ +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ @@ -467,6 +469,7 @@ enum { EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ @@ -512,6 +515,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); @@ -828,11 +832,13 @@ static inline void ext4_decode_extra_time(struct timespec64 *time, #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ do { \ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ (raw_inode)->xtime ## _extra = \ ext4_encode_extra_time(&(inode)->xtime); \ } \ + else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ } while (0) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ @@ -1560,6 +1566,7 @@ enum { EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1610,6 +1617,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_SB(sb) (sb) #endif +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -1632,6 +1645,10 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_GOOD_OLD_INODE_SIZE 128 +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + /* * Feature set definitions */ @@ -1662,6 +1679,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1756,6 +1774,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) @@ -1813,7 +1832,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ - EXT4_FEATURE_RO_COMPAT_PROJECT) + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -3177,6 +3197,8 @@ static inline void ext4_set_de_type(struct super_block *sb, extern int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; @@ -3283,6 +3305,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +/* verity.c */ +extern const struct fsverity_operations ext4_verityops; + /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 70b0438dbc94..b8a20bb9a145 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -457,6 +457,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret) return ret; + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; + /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..d0dc0e3463db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1340,6 +1340,9 @@ retry_journal: } if (ret) { + bool extended = (pos + len > inode->i_size) && + !ext4_verity_in_progress(inode); + unlock_page(page); /* * __block_write_begin may have instantiated a few blocks @@ -1349,11 +1352,11 @@ retry_journal: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); - if (pos + len > inode->i_size) { + if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might @@ -1406,6 +1409,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (inline_data) { @@ -1423,12 +1427,16 @@ static int ext4_write_end(struct file *file, /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. + * + * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree + * blocks are being written past EOF, so skip the i_size update. */ - i_size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily @@ -1439,7 +1447,7 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1450,7 +1458,7 @@ errout: if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1511,6 +1519,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); @@ -1540,13 +1549,14 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed || inline_data) { @@ -1555,7 +1565,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1566,7 +1576,7 @@ errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -2162,7 +2172,8 @@ static int ext4_writepage(struct page *page, trace_ext4_writepage(page); size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2246,7 +2257,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2345,6 +2357,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; + if (ext4_verity_in_progress(inode)) + blocks = EXT_MAX_BLOCKS; + do { BUG_ON(buffer_locked(bh)); @@ -3061,8 +3076,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || - S_ISLNK(inode->i_mode)) { + if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || + ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3897,6 +3912,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return 0; #endif + if (fsverity_active(inode)) + return 0; /* * If we are doing data journalling we don't support O_DIRECT @@ -4586,7 +4603,6 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; - struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4675,7 +4691,6 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ - blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4706,7 +4721,6 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); - blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, @@ -4739,6 +4753,8 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) + return false; return true; } @@ -4763,9 +4779,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; + if (flags & EXT4_VERITY_FL) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| - S_ENCRYPTED|S_CASEFOLD); + S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5555,6 +5573,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fsverity_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..5444d49cbf09 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1113,8 +1113,35 @@ resizefs_out: #endif } case EXT4_IOC_GET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_add_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key_all_users(filp, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1171,6 +1198,17 @@ out: } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); + + case FS_IOC_ENABLE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_enable(filp, (const void __user *)arg); + + case FS_IOC_MEASURE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_measure(filp, (void __user *)arg); + default: return -ENOTTY; } @@ -1231,8 +1269,15 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c916017db334..a30b203fa461 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -47,13 +47,103 @@ #include "ext4.h" -static inline bool ext4_bio_encrypted(struct bio *bio) +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, + STEP_VERITY, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { -#ifdef CONFIG_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + struct page *page; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + /* will re-read again later */ + ClearPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fsverity_verify_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. + */ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; } /* @@ -70,30 +160,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio) */ static void mpage_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + if (bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; - if (ext4_bio_encrypted(bio)) { - if (bio->bi_status) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); - return; - } + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + __read_end_io(bio); +} - if (!bio->bi_status) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); +static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, + struct bio *bio, + pgoff_t first_idx) +{ + unsigned int post_read_steps = 0; + struct bio_post_read_ctx *ctx = NULL; + + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) + post_read_steps |= 1 << STEP_DECRYPT; + + if (ext4_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; } + return ctx; +} - bio_put(bio); +static inline loff_t ext4_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); } int ext4_mpage_readpages(struct address_space *mapping, @@ -141,7 +254,8 @@ int ext4_mpage_readpages(struct address_space *mapping, block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = (ext4_readpage_limit(inode) + + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; @@ -218,6 +332,9 @@ int ext4_mpage_readpages(struct address_space *mapping, zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { + if (ext4_need_verity(inode, page->index) && + !fsverity_verify_page(page)) + goto set_error_page; SetPageUptodate(page); unlock_page(page); goto next_page; @@ -241,18 +358,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct fscrypt_ctx *ctx = NULL; + struct bio_post_read_ctx *ctx; - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(GFP_NOFS); - if (IS_ERR(ctx)) - goto set_error_page; - } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + if (!bio) + goto set_error_page; + ctx = get_bio_post_read_ctx(inode, bio, page->index); + if (IS_ERR(ctx)) { + bio_put(bio); + bio = NULL; goto set_error_page; } bio_set_dev(bio, bdev); @@ -293,3 +408,29 @@ int ext4_mpage_readpages(struct address_space *mapping, submit_bio(bio); return 0; } + +int __init ext4_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = + kmem_cache_create("ext4_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void ext4_exit_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..3db5f17228b7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1107,6 +1107,9 @@ static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); + if (!drop) + drop = fscrypt_drop_inode(inode); + trace_ext4_drop_inode(inode, drop); return drop; } @@ -1179,6 +1182,7 @@ void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -4035,8 +4039,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inode_size); goto failed_mount; } - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) - sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + /* + * i_atime_extra is the last extra field available for [acm]times in + * struct ext4_inode. Checking for that field should suffice to ensure + * we have extra space for all three. + */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + + sb->s_time_min = EXT4_TIMESTAMP_MIN; } sbi->s_desc_size = le16_to_cpu(es->s_desc_size); @@ -4272,6 +4289,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &ext4_verityops; +#endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -4419,6 +4439,11 @@ no_journal: goto failed_mount_wq; } + if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); + goto failed_mount_wq; + } + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); @@ -6095,6 +6120,10 @@ static int __init ext4_init_fs(void) err = ext4_init_pending(); if (err) + goto out7; + + err = ext4_init_post_read_processing(); + if (err) goto out6; err = ext4_init_pageio(); @@ -6135,8 +6164,10 @@ out3: out4: ext4_exit_pageio(); out5: - ext4_exit_pending(); + ext4_exit_post_read_processing(); out6: + ext4_exit_pending(); +out7: ext4_exit_es(); return err; @@ -6153,6 +6184,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index b3cd7655a6ff..eb1efad0e20a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -242,6 +242,9 @@ EXT4_ATTR_FEATURE(encryption); #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); #endif +#ifdef CONFIG_FS_VERITY +EXT4_ATTR_FEATURE(verity); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { @@ -254,6 +257,9 @@ static struct attribute *ext4_feat_attrs[] = { #ifdef CONFIG_UNICODE ATTR_LIST(casefold), #endif +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c new file mode 100644 index 000000000000..d0d8a9795dd6 --- /dev/null +++ b/fs/ext4/verity.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/verity.c: fs-verity support for ext4 + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for ext4. + * + * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past + * the end of the file, starting at the first 64K boundary beyond i_size. This + * approach works because (a) verity files are readonly, and (b) pages fully + * beyond i_size aren't visible to userspace but can be read/written internally + * by ext4 with only some relatively small changes to ext4. This approach + * avoids having to depend on the EA_INODE feature and on rearchitecturing + * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and + * to support encrypting xattrs. Note that the verity metadata *must* be + * encrypted when the file is, since it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/quotaops.h> + +#include "ext4.h" +#include "ext4_extents.h" +#include "ext4_jbd2.h" + +static inline loff_t ext4_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *addr; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + addr = kmap_atomic(page); + memcpy(buf, addr + offset_in_page(pos), n); + kunmap_atomic(addr); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata; + void *addr; + int res; + + res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, + &page, &fsdata); + if (res) + return res; + + addr = kmap_atomic(page); + memcpy(addr + offset_in_page(pos), buf, n); + kunmap_atomic(addr); + + res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, + page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +static int ext4_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_add() */ + handle_t *handle; + int err; + + if (ext4_verity_in_progress(inode)) + return -EBUSY; + + /* + * Since the file was opened readonly, we have to initialize the jbd + * inode and quotas here and not rely on ->open() doing it. This must + * be done before evicting the inline data. + */ + + err = ext4_inode_attach_jinode(inode); + if (err) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_convert_inline_data(inode); + if (err) + return err; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_warning_inode(inode, + "verity is only allowed on extent-based files"); + return -EOPNOTSUPP; + } + + /* + * ext4 uses the last allocated block to find the verity descriptor, so + * we must remove any other blocks past EOF which might confuse things. + */ + err = ext4_truncate(inode); + if (err) + return err; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err == 0) + ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + ext4_journal_stop(handle); + return err; +} + +/* + * ext4 stores the verity descriptor beginning on the next filesystem block + * boundary after the Merkle tree. Then, the descriptor size is stored in the + * last 4 bytes of the last allocated filesystem block --- which is either the + * block in which the descriptor ends, or the next block after that if there + * weren't at least 4 bytes remaining. + * + * We can't simply store the descriptor in an xattr because it *must* be + * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt + * xattrs. Also, if the descriptor includes a large signature blob it may be + * too large to store in an xattr without the EA_INODE feature. + */ +static int ext4_write_verity_descriptor(struct inode *inode, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) + + merkle_tree_size, i_blocksize(inode)); + const u64 desc_end = desc_pos + desc_size; + const __le32 desc_size_disk = cpu_to_le32(desc_size); + const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk), + i_blocksize(inode)) - + sizeof(desc_size_disk); + int err; + + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + return err; + + return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); +} + +static int ext4_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_del() */ + handle_t *handle; + int err = 0; + int err2; + + if (desc != NULL) { + /* Succeeded; write the verity descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + + /* Write all pages before clearing VERITY_IN_PROGRESS. */ + if (!err) + err = filemap_write_and_wait(inode->i_mapping); + } + + /* If we failed, truncate anything we wrote past i_size. */ + if (desc == NULL || err) + ext4_truncate(inode); + + /* + * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and + * deleting the inode from the orphan list, even if something failed. + * If everything succeeded, we'll also set the verity bit in the same + * transaction. + */ + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + err2 = ext4_orphan_del(handle, inode); + if (err2) + goto out_stop; + + if (desc != NULL && !err) { + struct ext4_iloc iloc; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } +out_stop: + ext4_journal_stop(handle); + return err ?: err2; +} + +static int ext4_get_verity_descriptor_location(struct inode *inode, + size_t *desc_size_ret, + u64 *desc_pos_ret) +{ + struct ext4_ext_path *path; + struct ext4_extent *last_extent; + u32 end_lblk; + u64 desc_size_pos; + __le32 desc_size_disk; + u32 desc_size; + u64 desc_pos; + int err; + + /* + * Descriptor size is in last 4 bytes of last allocated block. + * See ext4_write_verity_descriptor(). + */ + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + EXT4_ERROR_INODE(inode, "verity file doesn't use extents"); + return -EFSCORRUPTED; + } + + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + last_extent = path[path->p_depth].p_ext; + if (!last_extent) { + EXT4_ERROR_INODE(inode, "verity file has no extents"); + ext4_ext_drop_refs(path); + kfree(path); + return -EFSCORRUPTED; + } + + end_lblk = le32_to_cpu(last_extent->ee_block) + + ext4_ext_get_actual_len(last_extent); + desc_size_pos = (u64)end_lblk << inode->i_blkbits; + ext4_ext_drop_refs(path); + kfree(path); + + if (desc_size_pos < sizeof(desc_size_disk)) + goto bad; + desc_size_pos -= sizeof(desc_size_disk); + + err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); + if (err) + return err; + desc_size = le32_to_cpu(desc_size_disk); + + /* + * The descriptor is stored just before the desc_size_disk, but starting + * on a filesystem block boundary. + */ + + if (desc_size > INT_MAX || desc_size > desc_size_pos) + goto bad; + + desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode)); + if (desc_pos < ext4_verity_metadata_pos(inode)) + goto bad; + + *desc_size_ret = desc_size; + *desc_pos_ret = desc_pos; + return 0; + +bad: + EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor"); + return -EFSCORRUPTED; +} + +static int ext4_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + size_t desc_size = 0; + u64 desc_pos = 0; + int err; + + err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos); + if (err) + return err; + + if (buf_size) { + if (desc_size > buf_size) + return -ERANGE; + err = pagecache_read(inode, buf, desc_size, desc_pos); + if (err) + return err; + } + return desc_size; +} + +static struct page *ext4_read_merkle_tree_page(struct inode *inode, + pgoff_t index) +{ + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; + + return read_mapping_page(inode->i_mapping, index, NULL); +} + +static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations ext4_verityops = { + .begin_enable_verity = ext4_begin_enable_verity, + .end_enable_verity = ext4_end_enable_verity, + .get_verity_descriptor = ext4_get_verity_descriptor, + .read_merkle_tree_page = ext4_read_merkle_tree_page, + .write_merkle_tree_block = ext4_write_merkle_tree_block, +}; diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 776c4b936504..2aaecc63834f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -8,3 +8,4 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o +f2fs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index abbf14e9bd72..54cad80acb7d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -74,6 +74,7 @@ static enum count_type __read_io_type(struct page *page) enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, + STEP_VERITY, }; struct bio_post_read_ctx { @@ -120,8 +121,23 @@ static void decrypt_work(struct work_struct *work) bio_post_read_processing(ctx); } +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fsverity_verify_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. + */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { @@ -131,6 +147,14 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) } ctx->cur_step++; /* fall-through */ + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ default: __read_end_io(ctx->bio); } @@ -608,8 +632,15 @@ out: up_write(&io->io_rwsem); } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages, unsigned op_flag) + unsigned nr_pages, unsigned op_flag, + pgoff_t first_idx) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -625,6 +656,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; + + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + if (post_read_steps) { ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); if (!ctx) { @@ -646,7 +681,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0); + bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1569,6 +1604,15 @@ out: return ret; } +static inline loff_t f2fs_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || f2fs_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); +} + static int f2fs_read_single_page(struct inode *inode, struct page *page, unsigned nr_pages, struct f2fs_map_blocks *map, @@ -1587,7 +1631,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, block_in_file = (sector_t)page_index(page); last_block = block_in_file + nr_pages; - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> + last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -1632,6 +1676,11 @@ got_it: } else { zero_out: zero_user_segment(page, 0, PAGE_SIZE); + if (f2fs_need_verity(inode, page->index) && + !fsverity_verify_page(page)) { + ret = -EIO; + goto out; + } if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); @@ -1650,7 +1699,7 @@ submit_and_realloc: } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, - is_readahead ? REQ_RAHEAD : 0); + is_readahead ? REQ_RAHEAD : 0, page->index); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2052,7 +2101,7 @@ static int __write_data_page(struct page *page, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (page->index < end_index) + if (page->index < end_index || f2fs_verity_in_progress(inode)) goto write; /* @@ -2427,7 +2476,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); - if (to > i_size) { + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ + if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); @@ -2458,7 +2508,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * the block addresses when there is no need to fill the page. */ if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC)) + !is_inode_flag_set(inode, FI_NO_PREALLOC) && + !f2fs_verity_in_progress(inode)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ @@ -2597,7 +2648,8 @@ repeat: if (len == PAGE_SIZE || PageUptodate(page)) return 0; - if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && + !f2fs_verity_in_progress(inode)) { zero_user_segment(page, len, PAGE_SIZE); return 0; } @@ -2660,7 +2712,8 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); - if (pos + copied > i_size_read(inode)) + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) f2fs_i_size_write(inode, pos + copied); unlock_out: f2fs_put_page(page, 1); @@ -3104,7 +3157,9 @@ void f2fs_clear_page_cache_dirty_tag(struct page *page) int __init f2fs_init_post_read_processing(void) { - bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); + bio_post_read_ctx_cache = + kmem_cache_create("f2fs_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 17382da7f0bd..7c5f121edac5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -25,6 +25,7 @@ #include <crypto/hash.h> #include <linux/fscrypt.h> +#include <linux/fsverity.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -151,7 +152,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 -#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ +#define F2FS_FEATURE_VERITY 0x0400 #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define __F2FS_HAS_FEATURE(raw_super, mask) \ @@ -630,7 +631,7 @@ enum { #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 -#define FADVISE_VERITY_BIT 0x40 /* reserved */ +#define FADVISE_VERITY_BIT 0x40 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -650,6 +651,8 @@ enum { #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) +#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) +#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) #define DEF_DIR_LEVEL 0 @@ -2412,6 +2415,7 @@ enum { FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2451,6 +2455,12 @@ static inline void clear_inode_flag(struct inode *inode, int flag) __mark_inode_dirty_flag(inode, flag, false); } +static inline bool f2fs_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS); +} + static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; @@ -3521,6 +3531,9 @@ void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); +/* verity.c */ +extern const struct fsverity_operations f2fs_verityops; + /* * crypto support */ @@ -3543,7 +3556,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) */ static inline bool f2fs_post_read_required(struct inode *inode) { - return f2fs_encrypted_file(inode); + return f2fs_encrypted_file(inode) || fsverity_active(inode); } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3561,6 +3574,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); #ifdef CONFIG_BLK_DEV_ZONED diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3e58a6f697dd..56efde9d3659 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -496,6 +496,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + err = fsverity_file_open(inode, filp); + if (err) + return err; + filp->f_mode |= FMODE_NOWAIT; return dquot_file_open(inode, filp); @@ -741,15 +745,18 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (ia_valid & ATTR_ATIME) - inode->i_atime = timespec64_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec64_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - inode->i_ctime = timespec64_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + if (ia_valid & ATTR_ATIME) { + inode->i_atime = timestamp_truncate(attr->ia_atime, + inode); + } + if (ia_valid & ATTR_MTIME) { + inode->i_mtime = timestamp_truncate(attr->ia_mtime, + inode); + } + if (ia_valid & ATTR_CTIME) { + inode->i_ctime = timestamp_truncate(attr->ia_ctime, + inode); + } if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -778,6 +785,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + err = fsverity_prepare_setattr(dentry, attr); + if (err) + return err; + if (is_quota_modification(inode, attr)) { err = dquot_initialize(inode); if (err) @@ -1705,7 +1716,8 @@ static const struct { FS_PROJINHERIT_FL | \ FS_ENCRYPT_FL | \ FS_INLINE_DATA_FL | \ - FS_NOCOW_FL) + FS_NOCOW_FL | \ + FS_VERITY_FL) #define F2FS_SETTABLE_FS_FL ( \ FS_SYNC_FL | \ @@ -1750,6 +1762,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) if (IS_ENCRYPTED(inode)) fsflags |= FS_ENCRYPT_FL; + if (IS_VERITY(inode)) + fsflags |= FS_VERITY_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) fsflags |= FS_INLINE_DATA_FL; if (is_inode_flag_set(inode, FI_PIN_FILE)) @@ -2184,6 +2198,49 @@ out_err: return err; } +static int f2fs_ioc_get_encryption_policy_ex(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); +} + +static int f2fs_ioc_add_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_add_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key_all_users(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_key_status(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); +} + static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3060,6 +3117,30 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) return ret; } +static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { + f2fs_warn(F2FS_I_SB(inode), + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n", + inode->i_ino); + return -EOPNOTSUPP; + } + + return fsverity_ioctl_enable(filp, (const void __user *)arg); +} + +static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_measure(filp, (void __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3092,6 +3173,16 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_policy(filp, arg); case F2FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return f2fs_ioc_get_encryption_policy_ex(filp, arg); + case FS_IOC_ADD_ENCRYPTION_KEY: + return f2fs_ioc_add_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return f2fs_ioc_remove_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return f2fs_ioc_remove_encryption_key_all_users(filp, arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return f2fs_ioc_get_encryption_key_status(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); case F2FS_IOC_GARBAGE_COLLECT_RANGE: @@ -3118,6 +3209,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_precache_extents(filp, arg); case F2FS_IOC_RESIZE_FS: return f2fs_ioc_resize_fs(filp, arg); + case FS_IOC_ENABLE_VERITY: + return f2fs_ioc_enable_verity(filp, arg); + case FS_IOC_MEASURE_VERITY: + return f2fs_ioc_measure_verity(filp, arg); default: return -ENOTTY; } @@ -3219,6 +3314,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_ENCRYPTION_POLICY: case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: @@ -3232,6 +3332,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_PIN_FILE: case F2FS_IOC_PRECACHE_EXTENTS: case F2FS_IOC_RESIZE_FS: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a33d7a849b2d..06da75d418e0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -46,9 +46,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; if (file_is_encrypt(inode)) new_fl |= S_ENCRYPTED; + if (file_is_verity(inode)) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| - S_ENCRYPTED); + S_ENCRYPTED|S_VERITY); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -733,6 +735,7 @@ no_delete: } out_clear: fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); clear_inode(inode); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 78a1b873e48a..f43befda0e1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -913,6 +913,8 @@ static int f2fs_drop_inode(struct inode *inode) return 0; } ret = generic_drop_inode(inode); + if (!ret) + ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); return ret; } @@ -3144,6 +3146,9 @@ try_onemore: #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; #endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &f2fs_verityops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3aeacd0aacfd..0cd64f994068 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -131,6 +131,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_lost_found(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); + if (f2fs_sb_has_verity(sbi)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "verity"); if (f2fs_sb_has_sb_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "sb_checksum"); @@ -364,6 +367,7 @@ enum feat_id { FEAT_QUOTA_INO, FEAT_INODE_CRTIME, FEAT_LOST_FOUND, + FEAT_VERITY, FEAT_SB_CHECKSUM, }; @@ -381,6 +385,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: case FEAT_LOST_FOUND: + case FEAT_VERITY: case FEAT_SB_CHECKSUM: return snprintf(buf, PAGE_SIZE, "supported\n"); } @@ -470,6 +475,9 @@ F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +#ifdef CONFIG_FS_VERITY +F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); +#endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -534,6 +542,9 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), ATTR_LIST(lost_found), +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif ATTR_LIST(sb_checksum), NULL, }; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c new file mode 100644 index 000000000000..a401ef72bc82 --- /dev/null +++ b/fs/f2fs/verity.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/verity.c: fs-verity support for f2fs + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for f2fs. + * + * Like ext4, f2fs stores the verity metadata (Merkle tree and + * fsverity_descriptor) past the end of the file, starting at the first 64K + * boundary beyond i_size. This approach works because (a) verity files are + * readonly, and (b) pages fully beyond i_size aren't visible to userspace but + * can be read/written internally by f2fs with only some relatively small + * changes to f2fs. Extended attributes cannot be used because (a) f2fs limits + * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be + * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't + * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is + * because it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *addr; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + addr = kmap_atomic(page); + memcpy(buf, addr + offset_in_page(pos), n); + kunmap_atomic(addr); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata; + void *addr; + int res; + + res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, + &page, &fsdata); + if (res) + return res; + + addr = kmap_atomic(page); + memcpy(addr + offset_in_page(pos), buf, n); + kunmap_atomic(addr); + + res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, + page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Format of f2fs verity xattr. This points to the location of the verity + * descriptor within the file data rather than containing it directly because + * the verity descriptor *must* be encrypted when f2fs encryption is used. But, + * f2fs encryption does not encrypt xattrs. + */ +struct fsverity_descriptor_location { + __le32 version; + __le32 size; + __le64 pos; +}; + +static int f2fs_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int err; + + if (f2fs_verity_in_progress(inode)) + return -EBUSY; + + if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + return -EOPNOTSUPP; + + /* + * Since the file was opened readonly, we have to initialize the quotas + * here and not rely on ->open() doing it. This must be done before + * evicting the inline data. + */ + err = dquot_initialize(inode); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + set_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; +} + +static int f2fs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; + struct fsverity_descriptor_location dloc = { + .version = cpu_to_le32(1), + .size = cpu_to_le32(desc_size), + .pos = cpu_to_le64(desc_pos), + }; + int err = 0; + + if (desc != NULL) { + /* Succeeded; write the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + + /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ + if (!err) + err = filemap_write_and_wait(inode->i_mapping); + } + + /* If we failed, truncate anything we wrote past i_size. */ + if (desc == NULL || err) + f2fs_truncate(inode); + + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + + if (desc != NULL && !err) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (!err) { + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } + } + return err; +} + +static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + struct fsverity_descriptor_location dloc; + int res; + u32 size; + u64 pos; + + /* Get the descriptor location */ + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL); + if (res < 0 && res != -ERANGE) + return res; + if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) { + f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format"); + return -EINVAL; + } + size = le32_to_cpu(dloc.size); + pos = le64_to_cpu(dloc.pos); + + /* Get the descriptor */ + if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || + pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { + f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + return -EFSCORRUPTED; + } + if (buf_size) { + if (size > buf_size) + return -ERANGE; + res = pagecache_read(inode, buf, size, pos); + if (res) + return res; + } + return size; +} + +static struct page *f2fs_read_merkle_tree_page(struct inode *inode, + pgoff_t index) +{ + index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; + + return read_mapping_page(inode->i_mapping, index, NULL); +} + +static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations f2fs_verityops = { + .begin_enable_verity = f2fs_begin_enable_verity, + .end_enable_verity = f2fs_end_enable_verity, + .get_verity_descriptor = f2fs_get_verity_descriptor, + .read_merkle_tree_page = f2fs_read_merkle_tree_page, + .write_merkle_tree_block = f2fs_write_merkle_tree_block, +}; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index a90920e2f949..de0c600b9cab 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -34,8 +34,10 @@ #define F2FS_XATTR_INDEX_ADVISE 7 /* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ #define F2FS_XATTR_INDEX_ENCRYPTION 9 +#define F2FS_XATTR_INDEX_VERITY 11 #define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" +#define F2FS_XATTR_NAME_VERITY "v" struct f2fs_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 05689198f5af..5f04c5c810fb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -31,6 +31,11 @@ #define KB_IN_SECTORS 2 +/* DOS dates from 1980/1/1 through 2107/12/31 */ +#define FAT_DATE_MIN (0<<9 | 1<<5 | 1) +#define FAT_DATE_MAX (127<<9 | 12<<5 | 31) +#define FAT_TIME_MAX (23<<11 | 59<<5 | 29) + /* * A deserialized copy of the on-disk structure laid out in struct * fat_boot_sector. @@ -1605,6 +1610,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, int debug; long error; char buf[50]; + struct timespec64 ts; /* * GFP_KERNEL is ok here, because while we do hold the @@ -1698,6 +1704,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; + fat_time_fat2unix(sbi, &ts, 0, cpu_to_le16(FAT_DATE_MIN), 0); + sb->s_time_min = ts.tv_sec; + + fat_time_fat2unix(sbi, &ts, cpu_to_le16(FAT_TIME_MAX), + cpu_to_le16(FAT_DATE_MAX), 0); + sb->s_time_max = ts.tv_sec; if (!sbi->fat_length && bpb.fat32_length) { struct fat_boot_fsinfo *fsinfo; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index a89f68c3cbed..578a5062706e 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -229,6 +229,8 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) sbp->s_op = &vxfs_super_ops; sbp->s_fs_info = infp; + sbp->s_time_min = 0; + sbp->s_time_max = U32_MAX; if (!vxfs_try_sb_magic(sbp, silent, 1, (__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 542b02d170f8..8aaa7eec7b74 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,10 +36,6 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) -struct wb_completion { - atomic_t cnt; -}; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -61,19 +57,6 @@ struct wb_writeback_work { }; /* - * If one wants to wait for one or more wb_writeback_works, each work's - * ->done should be set to a wb_completion defined using the following - * macro. Once all work items are issued with wb_queue_work(), the caller - * can wait for the completion of all using wb_wait_for_completion(). Work - * items which are waited upon aren't freed automatically on completion. - */ -#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ - struct wb_completion cmpl = { \ - .cnt = ATOMIC_INIT(1), \ - } - - -/* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its @@ -182,7 +165,7 @@ static void finish_writeback_work(struct bdi_writeback *wb, if (work->auto_free) kfree(work); if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(&wb->bdi->wb_waitq); + wake_up_all(done->waitq); } static void wb_queue_work(struct bdi_writeback *wb, @@ -206,28 +189,44 @@ static void wb_queue_work(struct bdi_writeback *wb, /** * wb_wait_for_completion - wait for completion of bdi_writeback_works - * @bdi: bdi work items were issued to * @done: target wb_completion * * Wait for one or more work items issued to @bdi with their ->done field - * set to @done, which should have been defined with - * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such - * work items are completed. Work items which are waited upon aren't freed + * set to @done, which should have been initialized with + * DEFINE_WB_COMPLETION(). This function returns after all such work items + * are completed. Work items which are waited upon aren't freed * automatically on completion. */ -static void wb_wait_for_completion(struct backing_dev_info *bdi, - struct wb_completion *done) +void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, !atomic_read(&done->cnt)); } #ifdef CONFIG_CGROUP_WRITEBACK -/* parameters for foreign inode detection, see wb_detach_inode() */ +/* + * Parameters for foreign inode detection, see wbc_detach_inode() to see + * how they're used. + * + * These paramters are inherently heuristical as the detection target + * itself is fuzzy. All we want to do is detaching an inode from the + * current owner if it's being written to by some other cgroups too much. + * + * The current cgroup writeback is built on the assumption that multiple + * cgroups writing to the same inode concurrently is very rare and a mode + * of operation which isn't well supported. As such, the goal is not + * taking too long when a different cgroup takes over an inode while + * avoiding too aggressive flip-flops from occasional foreign writes. + * + * We record, very roughly, 2s worth of IO time history and if more than + * half of that is foreign, trigger the switch. The recording is quantized + * to 16 slots. To avoid tiny writes from swinging the decision too much, + * writes smaller than 1/8 of avg size are ignored. + */ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ -#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ @@ -237,6 +236,7 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ +#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; + trace_inode_switch_wbs(inode, old_wb, new_wb); + /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to @@ -489,18 +491,13 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; - /* - * Avoid starting new switches while sync_inodes_sb() is in - * progress. Otherwise, if the down_write protected issue path - * blocks heavily, we might end up starting a large number of - * switches which will block on the rwsem. - */ - if (!down_read_trylock(&bdi->wb_switch_rwsem)) + /* avoid queueing a new switch if too many are already in flight */ + if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - goto out_unlock; + return; /* find and pin the new wb */ rcu_read_lock(); @@ -534,15 +531,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); atomic_inc(&isw_nr_in_flight); - - goto out_unlock; + return; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); -out_unlock: - up_read(&bdi->wb_switch_rwsem); } /** @@ -681,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc) if (wbc->wb_id != max_id) history |= (1U << slots) - 1; + if (history) + trace_inode_foreign_history(inode, wbc, history); + /* * Switch if the current wb isn't the consistent winner. * If there are multiple closely competing dirtiers, the @@ -843,7 +840,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { - DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); + DEFINE_WB_COMPLETION(fallback_work_done, bdi); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; @@ -890,7 +887,7 @@ restart: last_wb = wb; rcu_read_unlock(); - wb_wait_for_completion(bdi, &fallback_work_done); + wb_wait_for_completion(&fallback_work_done); goto restart; } rcu_read_unlock(); @@ -900,6 +897,89 @@ restart: } /** + * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs + * @bdi_id: target bdi id + * @memcg_id: target memcg css id + * @nr_pages: number of pages to write, 0 for best-effort dirty flushing + * @reason: reason why some writeback work initiated + * @done: target wb_completion + * + * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id + * with the specified parameters. + */ +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, + enum wb_reason reason, struct wb_completion *done) +{ + struct backing_dev_info *bdi; + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + struct wb_writeback_work *work; + int ret; + + /* lookup bdi and memcg */ + bdi = bdi_get_by_id(bdi_id); + if (!bdi) + return -ENOENT; + + rcu_read_lock(); + memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys); + if (memcg_css && !css_tryget(memcg_css)) + memcg_css = NULL; + rcu_read_unlock(); + if (!memcg_css) { + ret = -ENOENT; + goto out_bdi_put; + } + + /* + * And find the associated wb. If the wb isn't there already + * there's nothing to flush, don't create one. + */ + wb = wb_get_lookup(bdi, memcg_css); + if (!wb) { + ret = -ENOENT; + goto out_css_put; + } + + /* + * If @nr is zero, the caller is attempting to write out most of + * the currently dirty pages. Let's take the current dirty page + * count and inflate it by 25% which should be large enough to + * flush out most dirty pages while avoiding getting livelocked by + * concurrent dirtiers. + */ + if (!nr) { + unsigned long filepages, headroom, dirty, writeback; + + mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, + &writeback); + nr = dirty * 10 / 8; + } + + /* issue the writeback work */ + work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + if (work) { + work->nr_pages = nr; + work->sync_mode = WB_SYNC_NONE; + work->range_cyclic = 1; + work->reason = reason; + work->done = done; + work->auto_free = 1; + wb_queue_work(wb, work); + ret = 0; + } else { + ret = -ENOMEM; + } + + wb_put(wb); +out_css_put: + css_put(memcg_css); +out_bdi_put: + bdi_put(bdi); + return ret; +} + +/** * cgroup_writeback_umount - flush inode wb switches for umount * * This function is called when a super_block is about to be destroyed and @@ -2362,7 +2442,8 @@ static void wait_sb_inodes(struct super_block *sb) static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { - DEFINE_WB_COMPLETION_ONSTACK(done); + struct backing_dev_info *bdi = sb->s_bdi; + DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, @@ -2371,14 +2452,13 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, .nr_pages = nr, .reason = reason, }; - struct backing_dev_info *bdi = sb->s_bdi; if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); - wb_wait_for_completion(bdi, &done); + wb_wait_for_completion(&done); } /** @@ -2440,7 +2520,8 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DEFINE_WB_COMPLETION_ONSTACK(done); + struct backing_dev_info *bdi = sb->s_bdi; + DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -2450,7 +2531,6 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, .for_sync = 1, }; - struct backing_dev_info *bdi = sb->s_bdi; /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty @@ -2464,7 +2544,7 @@ void sync_inodes_sb(struct super_block *sb) /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); - wb_wait_for_completion(bdi, &done); + wb_wait_for_completion(&done); bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); diff --git a/fs/fs_context.c b/fs/fs_context.c index 103643c68e3f..87c2c9687d90 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -279,10 +279,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); break; case FS_CONTEXT_FOR_RECONFIGURE: - /* We don't pin any namespaces as the superblock's - * subscriptions cannot be changed at this point. - */ atomic_inc(&reference->d_sb->s_active); + fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); fc->root = dget(reference); break; } diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 460ea4206fa2..d1930adce68d 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -204,9 +204,23 @@ int fs_parse(struct fs_context *fc, goto okay; case fs_param_is_fd: { - if (param->type != fs_value_is_file) + switch (param->type) { + case fs_value_is_string: + if (!result->has_value) + goto bad_value; + + ret = kstrtouint(param->string, 0, &result->uint_32); + break; + case fs_value_is_file: + result->uint_32 = param->dirfd; + ret = 0; + default: goto bad_value; - goto okay; + } + + if (result->uint_32 > INT_MAX) + goto bad_value; + goto maybe_okay; } case fs_param_is_blockdev: diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index ab2e7cc2ff33..1cca83218fb5 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -334,7 +334,7 @@ long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); * local time (HPFS) to GMT (Unix) */ -static inline time64_t local_to_gmt(struct super_block *s, time32_t t) +static inline time64_t local_to_gmt(struct super_block *s, time64_t t) { extern struct timezone sys_tz; return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; @@ -343,9 +343,7 @@ static inline time64_t local_to_gmt(struct super_block *s, time32_t t) static inline time32_t gmt_to_local(struct super_block *s, time64_t t) { extern struct timezone sys_tz; - t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; - - return clamp_t(time64_t, t, 0, U32_MAX); + return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; } static inline time32_t local_get_seconds(struct super_block *s) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 9db6d84f0d62..0a677a9aaf34 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -614,6 +614,8 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; s->s_d_op = &hpfs_dentry_operations; + s->s_time_min = local_to_gmt(s, 0); + s->s_time_max = local_to_gmt(s, U32_MAX); sbi->sb_root = le32_to_cpu(superblock->root); sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); diff --git a/fs/inode.c b/fs/inode.c index 0f1e3b563c47..64bf28cf05cd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2167,6 +2167,37 @@ struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran) EXPORT_SYMBOL(timespec64_trunc); /** + * timestamp_truncate - Truncate timespec to a granularity + * @t: Timespec + * @inode: inode being updated + * + * Truncate a timespec to the granularity supported by the fs + * containing the inode. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). + */ +struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned int gran = sb->s_time_gran; + + t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); + if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) + t.tv_nsec = 0; + + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) + ; /* nothing */ + else if (gran == NSEC_PER_SEC) + t.tv_nsec = 0; + else if (gran > 1 && gran < NSEC_PER_SEC) + t.tv_nsec -= t.tv_nsec % gran; + else + WARN(1, "invalid file time granularity: %u", gran); + return t; +} +EXPORT_SYMBOL(timestamp_truncate); + +/** * current_time - Return FS time * @inode: inode. * @@ -2187,7 +2218,7 @@ struct timespec64 current_time(struct inode *inode) return now; } - return timespec64_trunc(now, inode->i_sb->s_time_gran); + return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); diff --git a/fs/io_uring.c b/fs/io_uring.c index cfb48bd088e1..0dadbdbead0f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -75,7 +75,7 @@ #include "internal.h" -#define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_ENTRIES 32768 #define IORING_MAX_FIXED_FILES 1024 struct io_uring { @@ -84,27 +84,29 @@ struct io_uring { }; /* - * This data is shared with the application through the mmap at offset - * IORING_OFF_SQ_RING. + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. * * The offsets to the member fields are published through struct * io_sqring_offsets when calling io_uring_setup. */ -struct io_sq_ring { +struct io_rings { /* * Head and tail offsets into the ring; the offsets need to be * masked to get valid indices. * - * The kernel controls head and the application controls tail. + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. */ - struct io_uring r; + struct io_uring sq, cq; /* - * Bitmask to apply to head and tail offsets (constant, equals + * Bitmasks to apply to head and tail offsets (constant, equals * ring_entries - 1) */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; /* * Number of invalid entries dropped by the kernel due to * invalid index stored in array @@ -117,7 +119,7 @@ struct io_sq_ring { * counter includes all submissions that were dropped reaching * the new SQ head (and possibly more). */ - u32 dropped; + u32 sq_dropped; /* * Runtime flags * @@ -127,43 +129,7 @@ struct io_sq_ring { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 flags; - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 array[]; -}; - -/* - * This data is shared with the application through the mmap at offset - * IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_cqring_offsets when calling io_uring_setup. - */ -struct io_cq_ring { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The application controls head and the kernel tail. - */ - struct io_uring r; - /* - * Bitmask to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -177,7 +143,7 @@ struct io_cq_ring { * As completion events come in out of order this counter is not * ordered with any other data. */ - u32 overflow; + u32 cq_overflow; /* * Ring buffer of completion events. * @@ -185,7 +151,7 @@ struct io_cq_ring { * produced, so the application is allowed to modify pending * entries. */ - struct io_uring_cqe cqes[]; + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; struct io_mapped_ubuf { @@ -201,7 +167,7 @@ struct async_list { struct list_head list; struct file *file; - off_t io_end; + off_t io_start; size_t io_len; }; @@ -215,8 +181,18 @@ struct io_ring_ctx { bool compat; bool account_mem; - /* SQ ring */ - struct io_sq_ring *sq_ring; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ + u32 *sq_array; unsigned cached_sq_head; unsigned sq_entries; unsigned sq_mask; @@ -227,15 +203,13 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IO offload */ - struct workqueue_struct *sqo_wq; + struct workqueue_struct *sqo_wq[2]; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; struct completion sqo_thread_started; struct { - /* CQ ring */ - struct io_cq_ring *cq_ring; unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; @@ -244,6 +218,8 @@ struct io_ring_ctx { struct eventfd_ctx *cq_ev_fd; } ____cacheline_aligned_in_smp; + struct io_rings *rings; + /* * If used, fixed file set. Writers must ensure that ->refs is dead, * readers must ensure that ->refs is alive as long as the file* is @@ -288,6 +264,7 @@ struct io_ring_ctx { struct sqe_submit { const struct io_uring_sqe *sqe; unsigned short index; + u32 sequence; bool has_user; bool needs_lock; bool needs_fixed_file; @@ -335,6 +312,7 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ +#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ u64 user_data; u32 result; u32 sequence; @@ -366,6 +344,7 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -430,7 +409,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx, if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped; + return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -451,11 +430,11 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) static void __io_commit_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { /* order cqe stores with ring update */ - smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); @@ -464,6 +443,24 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_queue_async_work(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + int rw; + + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + default: + rw = 0; + break; + } + + queue_work(ctx->sqo_wq[rw], &req->work); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; @@ -471,14 +468,19 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { + if (req->flags & REQ_F_SHADOW_DRAIN) { + /* Just for drain, free it. */ + __io_free_req(req); + continue; + } req->flags |= REQ_F_IO_DRAINED; - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; unsigned tail; tail = ctx->cached_cq_tail; @@ -487,11 +489,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) + if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries) return NULL; ctx->cached_cq_tail++; - return &ring->cqes[tail & ctx->cq_mask]; + return &rings->cqes[tail & ctx->cq_mask]; } static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, @@ -510,9 +512,9 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + unsigned overflow = READ_ONCE(ctx->rings->cq_overflow); - WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1); } } @@ -635,7 +637,7 @@ static void io_req_link_next(struct io_kiocb *req) nxt->flags |= REQ_F_LINK_DONE; INIT_WORK(&nxt->work, io_sq_wq_submit_work); - queue_work(req->ctx->sqo_wq, &nxt->work); + io_queue_async_work(req->ctx, nxt); } } @@ -679,11 +681,11 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static unsigned io_cqring_events(struct io_cq_ring *ring) +static unsigned io_cqring_events(struct io_rings *rings) { /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); + return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); } /* @@ -836,7 +838,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx->cq_ring)) + if (io_cqring_events(ctx->rings)) break; /* @@ -1187,6 +1189,28 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb) +{ + if (al->file == kiocb->ki_filp) { + off_t start, end; + + /* + * Allow merging if we're anywhere in the range of the same + * page. Generally this happens for sub-page reads or writes, + * and it's beneficial to allow the first worker to bring the + * page in and the piggy backed work can then work on the + * cached page. + */ + start = al->io_start & PAGE_MASK; + end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK; + if (kiocb->ki_pos >= start && kiocb->ki_pos <= end) + return true; + } + + al->file = NULL; + return false; +} + /* * Make a note of the last file/offset/direction we punted to async * context. We'll use this information to see if we can piggy back a @@ -1198,9 +1222,8 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) struct async_list *async_list = &req->ctx->pending_async[rw]; struct kiocb *kiocb = &req->rw; struct file *filp = kiocb->ki_filp; - off_t io_end = kiocb->ki_pos + len; - if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) { + if (io_should_merge(async_list, kiocb)) { unsigned long max_bytes; /* Use 8x RA size as a decent limiter for both reads/writes */ @@ -1213,17 +1236,16 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) req->flags |= REQ_F_SEQ_PREV; async_list->io_len += len; } else { - io_end = 0; - async_list->io_len = 0; + async_list->file = NULL; } } /* New file? Reset state. */ if (async_list->file != filp) { - async_list->io_len = 0; + async_list->io_start = kiocb->ki_pos; + async_list->io_len = len; async_list->file = filp; } - async_list->io_end = io_end; } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, @@ -1535,7 +1557,7 @@ static void io_poll_remove_one(struct io_kiocb *req) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - queue_work(req->ctx->sqo_wq, &req->work); + io_queue_async_work(req->ctx, req); } spin_unlock(&poll->head->lock); @@ -1649,7 +1671,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, io_cqring_ev_posted(ctx); io_put_req(req); } else { - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } return 1; @@ -1992,7 +2014,7 @@ out: */ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) { - bool ret = false; + bool ret; if (!list) return false; @@ -2038,10 +2060,14 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); - if (flags & IOSQE_IO_DRAIN) { + if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - req->sequence = ctx->cached_sq_head - 1; - } + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = s->sequence; if (!io_op_needs_file(s->sqe)) return 0; @@ -2063,21 +2089,12 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s) +static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, bool force_nonblock) { int ret; - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_free_req(req); - io_cqring_add_event(ctx, s->sqe->user_data, ret); - } - return 0; - } - - ret = __io_submit_sqe(ctx, req, s, true); + ret = __io_submit_sqe(ctx, req, s, force_nonblock); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; @@ -2094,7 +2111,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (list) atomic_inc(&list->cnt); INIT_WORK(&req->work, io_sq_wq_submit_work); - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } /* @@ -2119,10 +2136,70 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; } +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, bool force_nonblock) +{ + int ret; + + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + } + return 0; + } + + return __io_queue_sqe(ctx, req, s, force_nonblock); +} + +static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, struct io_kiocb *shadow, + bool force_nonblock) +{ + int ret; + int need_submit = false; + + if (!shadow) + return io_queue_sqe(ctx, req, s, force_nonblock); + + /* + * Mark the first IO in link list as DRAIN, let all the following + * IOs enter the defer list. all IO needs to be completed before link + * list. + */ + req->flags |= REQ_F_IO_DRAIN; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + return 0; + } + } else { + /* + * If ret == 0 means that all IOs in front of link io are + * running done. let's queue link head. + */ + need_submit = true; + } + + /* Insert shadow req to defer_list, blocking next IOs */ + spin_lock_irq(&ctx->completion_lock); + list_add_tail(&shadow->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + + if (need_submit) + return __io_queue_sqe(ctx, req, s, force_nonblock); + + return 0; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link) + struct io_submit_state *state, struct io_kiocb **link, + bool force_nonblock) { struct io_uring_sqe *sqe_copy; struct io_kiocb *req; @@ -2175,7 +2252,7 @@ err: INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s); + io_queue_sqe(ctx, req, s, force_nonblock); } } @@ -2205,15 +2282,15 @@ static void io_submit_state_start(struct io_submit_state *state, static void io_commit_sqring(struct io_ring_ctx *ctx) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them. */ - smp_store_release(&ring->r.head, ctx->cached_sq_head); + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } } @@ -2227,7 +2304,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; + u32 *sq_array = ctx->sq_array; unsigned head; /* @@ -2240,20 +2318,21 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&ring->r.tail)) + if (head == smp_load_acquire(&rings->sq.tail)) return false; - head = READ_ONCE(ring->array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (head < ctx->sq_entries) { s->index = head; s->sqe = &ctx->sq_sqes[head]; + s->sequence = ctx->cached_sq_head; ctx->cached_sq_head++; return true; } /* drop invalid entries */ ctx->cached_sq_head++; - ring->dropped++; + rings->sq_dropped++; return false; } @@ -2262,6 +2341,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submitted = 0; @@ -2276,11 +2356,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + true); link = NULL; } prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = sqes[i].sequence; + } + if (unlikely(mm_fault)) { io_cqring_add_event(ctx, sqes[i].sqe->user_data, -EFAULT); @@ -2288,13 +2378,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, sqes[i].has_user = has_user; sqes[i].needs_lock = true; sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link); + io_submit_sqe(ctx, &sqes[i], statep, &link, true); submitted++; } } if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, true); if (statep) io_submit_state_end(&state); @@ -2366,7 +2456,7 @@ static int io_sq_thread(void *data) TASK_INTERRUPTIBLE); /* Tell userspace we may need a wakeup call */ - ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; /* make sure to read SQ tail after writing flags */ smp_mb(); @@ -2380,12 +2470,12 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; continue; } finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } i = 0; @@ -2426,10 +2516,12 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, + bool block_for_last) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submit = 0; @@ -2439,6 +2531,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) } for (i = 0; i < to_submit; i++) { + bool force_nonblock = true; struct sqe_submit s; if (!io_get_sqring(ctx, &s)) @@ -2449,21 +2542,43 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + force_nonblock); link = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = s.sequence; + } + s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; - io_submit_sqe(ctx, &s, statep, &link); + + /* + * The caller will block for events after submit, submit the + * last IO non-blocking. This is either the only IO it's + * submitting, or it already submitted the previous ones. This + * improves performance by avoiding an async punt that we don't + * need to do. + */ + if (block_for_last && submit == to_submit) + force_nonblock = false; + + io_submit_sqe(ctx, &s, statep, &link, force_nonblock); } io_commit_sqring(ctx); if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + block_for_last); if (statep) io_submit_state_end(statep); @@ -2477,10 +2592,10 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; int ret; - if (io_cqring_events(ring) >= min_events) + if (io_cqring_events(rings) >= min_events) return 0; if (sig) { @@ -2496,12 +2611,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); + ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; - return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -2551,11 +2666,15 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) static void io_finish_async(struct io_ring_ctx *ctx) { + int i; + io_sq_thread_stop(ctx); - if (ctx->sqo_wq) { - destroy_workqueue(ctx->sqo_wq); - ctx->sqo_wq = NULL; + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) { + if (ctx->sqo_wq[i]) { + destroy_workqueue(ctx->sqo_wq[i]); + ctx->sqo_wq[i] = NULL; + } } } @@ -2763,16 +2882,31 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, } /* Do QD, or 2 * CPUS, whatever is smallest */ - ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq", + WQ_UNBOUND | WQ_FREEZABLE, min(ctx->sq_entries - 1, 2 * num_online_cpus())); - if (!ctx->sqo_wq) { + if (!ctx->sqo_wq[0]) { + ret = -ENOMEM; + goto err; + } + + /* + * This is for buffered writes, where we want to limit the parallelism + * due to file locking in file systems. As "normal" buffered writes + * should parellelize on writeout quite nicely, limit us to having 2 + * pending. This avoids massive contention on the inode when doing + * buffered async writes. + */ + ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq", + WQ_UNBOUND | WQ_FREEZABLE, 2); + if (!ctx->sqo_wq[1]) { ret = -ENOMEM; goto err; } return 0; err: - io_sq_thread_stop(ctx); + io_finish_async(ctx); mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; return ret; @@ -2821,17 +2955,45 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp_flags, get_order(size)); } +static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, + size_t *sq_offset) +{ + struct io_rings *rings; + size_t off, sq_array_size; + + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif + + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; + + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; + + if (sq_offset) + *sq_offset = off; + + return off; +} + static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t bytes; + size_t pages; - bytes = struct_size(sq_ring, array, sq_entries); - bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); - bytes += struct_size(cq_ring, cqes, cq_entries); + pages = (size_t)1 << get_order( + rings_size(sq_entries, cq_entries, NULL)); + pages += (size_t)1 << get_order( + array_size(sizeof(struct io_uring_sqe), sq_entries)); - return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + return pages; } static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) @@ -2845,7 +3007,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_page(imu->bvec[j].bv_page); + put_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -2989,10 +3151,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * if we did partial map, or found file backed vmas, * release any pages we did get */ - if (pret > 0) { - for (j = 0; j < pret; j++) - put_page(pages[j]); - } + if (pret > 0) + put_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -3078,9 +3238,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif - io_mem_free(ctx->sq_ring); + io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); - io_mem_free(ctx->cq_ring); percpu_ref_exit(&ctx->refs); if (ctx->account_mem) @@ -3101,10 +3260,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != - ctx->sq_ring->ring_entries) + if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != + ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3149,14 +3308,12 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset) { case IORING_OFF_SQ_RING: - ptr = ctx->sq_ring; + case IORING_OFF_CQ_RING: + ptr = ctx->rings; break; case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; - case IORING_OFF_CQ_RING: - ptr = ctx->cq_ring; - break; default: return -EINVAL; } @@ -3199,19 +3356,27 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * Just return the requested submit count, and wake the thread if * we were asked to. */ + ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sqo_wait); submitted = to_submit; - goto out_ctx; - } + } else if (to_submit) { + bool block_for_last = false; - ret = 0; - if (to_submit) { to_submit = min(to_submit, ctx->sq_entries); + /* + * Allow last submission to block in a series, IFF the caller + * asked to wait for events and we don't currently have + * enough. This potentially avoids an async punt. + */ + if (to_submit == min_complete && + io_cqring_events(ctx->rings) < min_complete) + block_for_last = true; + mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit); + submitted = io_ring_submit(ctx, to_submit, block_for_last); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3226,7 +3391,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } -out_ctx: io_ring_drop_ctx_refs(ctx, 1); out_fput: fdput(f); @@ -3243,19 +3407,27 @@ static const struct file_operations io_uring_fops = { static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t size; + struct io_rings *rings; + size_t size, sq_array_offset; - sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); - if (!sq_ring) + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + rings = io_mem_alloc(size); + if (!rings) return -ENOMEM; - ctx->sq_ring = sq_ring; - sq_ring->ring_mask = p->sq_entries - 1; - sq_ring->ring_entries = p->sq_entries; - ctx->sq_mask = sq_ring->ring_mask; - ctx->sq_entries = sq_ring->ring_entries; + ctx->rings = rings; + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + rings->sq_ring_mask = p->sq_entries - 1; + rings->cq_ring_mask = p->cq_entries - 1; + rings->sq_ring_entries = p->sq_entries; + rings->cq_ring_entries = p->cq_entries; + ctx->sq_mask = rings->sq_ring_mask; + ctx->cq_mask = rings->cq_ring_mask; + ctx->sq_entries = rings->sq_ring_entries; + ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) @@ -3265,15 +3437,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (!ctx->sq_sqes) return -ENOMEM; - cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) - return -ENOMEM; - - ctx->cq_ring = cq_ring; - cq_ring->ring_mask = p->cq_entries - 1; - cq_ring->ring_entries = p->cq_entries; - ctx->cq_mask = cq_ring->ring_mask; - ctx->cq_entries = cq_ring->ring_entries; return 0; } @@ -3377,21 +3540,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_sq_ring, r.head); - p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); - p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); - p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); - p->sq_off.flags = offsetof(struct io_sq_ring, flags); - p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); - p->sq_off.array = offsetof(struct io_sq_ring, array); + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_cq_ring, r.head); - p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); - p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); - p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); - p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); - p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + + p->features = IORING_FEAT_SINGLE_MMAP; return ret; err: io_ring_ctx_wait_and_kill(ctx); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 85a9093769a9..35768a63fb1d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -10,7 +10,7 @@ * * The following files are helpful: * - * Documentation/filesystems/nfs/Exporting + * Documentation/filesystems/nfs/exporting.rst * fs/exportfs/expfs.c. */ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 9e30d8703735..62c0462dc89f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -30,6 +30,9 @@ #include "isofs.h" #include "zisofs.h" +/* max tz offset is 13 hours */ +#define MAX_TZ_OFFSET (52*15*60) + #define BEQUIET static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); @@ -801,6 +804,10 @@ root_found: */ s->s_maxbytes = 0x80000000000LL; + /* ECMA-119 timestamp from 1900/1/1 with tz offset */ + s->s_time_min = mktime64(1900, 1, 1, 0, 0, 0) - MAX_TZ_OFFSET; + s->s_time_max = mktime64(U8_MAX+1900, 12, 31, 23, 59, 59) + MAX_TZ_OFFSET; + /* Set this for reference. Its not currently used except on write which we don't have .. */ diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 8a20ddd25f2d..05fe6cf5f1ac 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/cred.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/list.h> #include <linux/mtd/mtd.h> #include <linux/pagemap.h> @@ -184,7 +185,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { truncate_setsize(inode, iattr->ia_size); inode->i_blocks = (inode->i_size + 511) >> 9; - } + } return 0; } @@ -391,7 +392,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) jffs2_do_setattr(inode, &iattr); } -int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) +int jffs2_do_remount_fs(struct super_block *sb, struct fs_context *fc) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -409,10 +410,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->alloc_sem); } - if (!(*flags & SB_RDONLY)) + if (!(fc->sb_flags & SB_RDONLY)) jffs2_start_garbage_collect_thread(c); - *flags |= SB_NOATIME; + fc->sb_flags |= SB_NOATIME; return 0; } @@ -509,7 +510,7 @@ static int calculate_inocache_hashsize(uint32_t flash_size) return hashsize; } -int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) +int jffs2_do_fill_super(struct super_block *sb, struct fs_context *fc) { struct jffs2_sb_info *c; struct inode *root_i; @@ -524,11 +525,11 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) #ifndef CONFIG_JFFS2_FS_WRITEBUFFER if (c->mtd->type == MTD_NANDFLASH) { - pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n"); + errorf(fc, "Cannot operate on NAND flash unless jffs2 NAND support is compiled in"); return -EINVAL; } if (c->mtd->type == MTD_DATAFLASH) { - pr_err("Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in\n"); + errorf(fc, "Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in"); return -EINVAL; } #endif @@ -542,12 +543,12 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) */ if ((c->sector_size * blocks) != c->flash_size) { c->flash_size = c->sector_size * blocks; - pr_info("Flash size not aligned to erasesize, reducing to %dKiB\n", - c->flash_size / 1024); + infof(fc, "Flash size not aligned to erasesize, reducing to %dKiB", + c->flash_size / 1024); } if (c->flash_size < 5*c->sector_size) { - pr_err("Too few erase blocks (%d)\n", + errorf(fc, "Too few erase blocks (%d)", c->flash_size / c->sector_size); return -EINVAL; } @@ -590,6 +591,9 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = JFFS2_SUPER_MAGIC; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; + if (!sb_rdonly(sb)) jffs2_start_garbage_collect_thread(c); return 0; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index bd3d5f0ddc34..21071fc2975d 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -172,8 +172,8 @@ void jffs2_dirty_inode(struct inode *inode, int flags); struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -int jffs2_do_remount_fs(struct super_block *, int *, char *); -int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); +int jffs2_do_remount_fs(struct super_block *sb, struct fs_context *fc); +int jffs2_do_fill_super(struct super_block *sb, struct fs_context *fc); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index af4aa6599473..cbe70637c117 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -19,7 +19,8 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/mount.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/jffs2.h> #include <linux/pagemap.h> #include <linux/mtd/super.h> @@ -157,96 +158,77 @@ static const struct export_operations jffs2_export_ops = { /* * JFFS2 mount options. * + * Opt_source: The source device * Opt_override_compr: override default compressor * Opt_rp_size: size of reserved pool in KiB - * Opt_err: just end of array marker */ enum { + Opt_source, Opt_override_compr, Opt_rp_size, - Opt_err, }; -static const match_table_t tokens = { - {Opt_override_compr, "compr=%s"}, - {Opt_rp_size, "rp_size=%u"}, - {Opt_err, NULL}, +static const struct fs_parameter_spec jffs2_param_specs[] = { + fsparam_string ("source", Opt_source), + fsparam_enum ("compr", Opt_override_compr), + fsparam_u32 ("rp_size", Opt_rp_size), + {} }; -static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) -{ - substring_t args[MAX_OPT_ARGS]; - char *p, *name; - unsigned int opt; - - if (!data) - return 0; - - while ((p = strsep(&data, ","))) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_override_compr: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "none")) - c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; +static const struct fs_parameter_enum jffs2_param_enums[] = { + { Opt_override_compr, "none", JFFS2_COMPR_MODE_NONE }, #ifdef CONFIG_JFFS2_LZO - else if (!strcmp(name, "lzo")) - c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; + { Opt_override_compr, "lzo", JFFS2_COMPR_MODE_FORCELZO }, #endif #ifdef CONFIG_JFFS2_ZLIB - else if (!strcmp(name, "zlib")) - c->mount_opts.compr = - JFFS2_COMPR_MODE_FORCEZLIB; + { Opt_override_compr, "zlib", JFFS2_COMPR_MODE_FORCEZLIB }, #endif - else { - pr_err("Error: unknown compressor \"%s\"\n", - name); - kfree(name); - return -EINVAL; - } - kfree(name); - c->mount_opts.override_compr = true; - break; - case Opt_rp_size: - if (match_int(&args[0], &opt)) - return -EINVAL; - opt *= 1024; - if (opt > c->mtd->size) { - pr_warn("Too large reserve pool specified, max " - "is %llu KB\n", c->mtd->size / 1024); - return -EINVAL; - } - c->mount_opts.rp_size = opt; - break; - default: - pr_err("Error: unrecognized mount option '%s' or missing value\n", - p); - return -EINVAL; - } + {} +}; + +const struct fs_parameter_description jffs2_fs_parameters = { + .name = "jffs2", + .specs = jffs2_param_specs, + .enums = jffs2_param_enums, +}; + +static int jffs2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct jffs2_sb_info *c = fc->s_fs_info; + int opt; + + opt = fs_parse(fc, &jffs2_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_override_compr: + c->mount_opts.compr = result.uint_32; + c->mount_opts.override_compr = true; + break; + case Opt_rp_size: + if (result.uint_32 > UINT_MAX / 1024) + return invalf(fc, "jffs2: rp_size unrepresentable"); + opt = result.uint_32 * 1024; + if (opt > c->mtd->size) + return invalf(fc, "jffs2: Too large reserve pool specified, max is %llu KB", + c->mtd->size / 1024); + c->mount_opts.rp_size = opt; + break; + default: + return -EINVAL; } return 0; } -static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +static int jffs2_reconfigure(struct fs_context *fc) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - int err; + struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); - err = jffs2_parse_options(c, data); - if (err) - return -EINVAL; - - return jffs2_do_remount_fs(sb, flags, data); + return jffs2_do_remount_fs(sb, fc); } static const struct super_operations jffs2_super_operations = @@ -255,7 +237,6 @@ static const struct super_operations jffs2_super_operations = .free_inode = jffs2_free_inode, .put_super = jffs2_put_super, .statfs = jffs2_statfs, - .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, .dirty_inode = jffs2_dirty_inode, .show_options = jffs2_show_options, @@ -265,26 +246,16 @@ static const struct super_operations jffs2_super_operations = /* * fill in the superblock */ -static int jffs2_fill_super(struct super_block *sb, void *data, int silent) +static int jffs2_fill_super(struct super_block *sb, struct fs_context *fc) { - struct jffs2_sb_info *c; - int ret; + struct jffs2_sb_info *c = sb->s_fs_info; jffs2_dbg(1, "jffs2_get_sb_mtd():" " New superblock for device %d (\"%s\")\n", sb->s_mtd->index, sb->s_mtd->name); - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - c->mtd = sb->s_mtd; c->os_priv = sb; - sb->s_fs_info = c; - - ret = jffs2_parse_options(c, data); - if (ret) - return -EINVAL; /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ @@ -302,15 +273,37 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_JFFS2_FS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - ret = jffs2_do_fill_super(sb, data, silent); - return ret; + return jffs2_do_fill_super(sb, fc); } -static struct dentry *jffs2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int jffs2_get_tree(struct fs_context *fc) { - return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super); + return get_tree_mtd(fc, jffs2_fill_super); +} + +static void jffs2_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations jffs2_context_ops = { + .free = jffs2_free_fc, + .parse_param = jffs2_parse_param, + .get_tree = jffs2_get_tree, + .reconfigure = jffs2_reconfigure, +}; + +static int jffs2_init_fs_context(struct fs_context *fc) +{ + struct jffs2_sb_info *ctx; + + ctx = kzalloc(sizeof(struct jffs2_sb_info), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->s_fs_info = ctx; + fc->ops = &jffs2_context_ops; + return 0; } static void jffs2_put_super (struct super_block *sb) @@ -347,7 +340,8 @@ static void jffs2_kill_sb(struct super_block *sb) static struct file_system_type jffs2_fs_type = { .owner = THIS_MODULE, .name = "jffs2", - .mount = jffs2_mount, + .init_fs_context = jffs2_init_fs_context, + .parameters = &jffs2_fs_parameters, .kill_sb = jffs2_kill_sb, }; MODULE_ALIAS_FS("jffs2"); diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 22a273bd4648..05cb0e8e4382 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -5,7 +5,7 @@ config JFS_FS select CRC32 help This is a port of IBM's Journaled Filesystem . More information is - available in the file <file:Documentation/filesystems/jfs.txt>. + available in the file <file:Documentation/admin-guide/jfs.rst>. If you do not intend to use the JFS filesystem, say N. diff --git a/fs/jfs/super.c b/fs/jfs/super.c index f4e10cb9f734..b2dc4d1f9dcc 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -503,6 +503,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sb->s_max_links = JFS_LINK_MAX; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; sbi->sb = sb; sbi->uid = INVALID_UID; sbi->gid = INVALID_GID; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a387534c9577..6ebae6bbe6a5 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -137,6 +137,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, if (kn_from == kn_to) return strlcpy(buf, "/", buflen); + if (!buf) + return -EINVAL; + common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; @@ -144,8 +147,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); - if (buf) - buf[0] = '\0'; + buf[0] = '\0'; for (i = 0; i < depth_from; i++) len += strlcpy(buf + len, parent_str, @@ -430,7 +432,6 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) */ void kernfs_put_active(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) @@ -442,7 +443,7 @@ void kernfs_put_active(struct kernfs_node *kn) if (likely(v != KN_DEACTIVATED_BIAS)) return; - wake_up_all(&root->deactivate_waitq); + wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f3f3984cce80..f3eaa8869f42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -158,12 +158,11 @@ static inline void set_default_inode_attr(struct inode *inode, umode_t mode) static inline void set_inode_attr(struct inode *inode, struct kernfs_iattrs *attrs) { - struct super_block *sb = inode->i_sb; inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; - inode->i_atime = timespec64_trunc(attrs->ia_atime, sb->s_time_gran); - inode->i_mtime = timespec64_trunc(attrs->ia_mtime, sb->s_time_gran); - inode->i_ctime = timespec64_trunc(attrs->ia_ctime, sb->s_time_gran); + inode->i_atime = timestamp_truncate(attrs->ia_atime, inode); + inode->i_mtime = timestamp_truncate(attrs->ia_mtime, inode); + inode->i_ctime = timestamp_truncate(attrs->ia_ctime, inode); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/locks.c b/fs/locks.c index 686eae21daf6..a364ebc5cec3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1592,7 +1592,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) { WARN_ON_ONCE(1); - return error; + goto free_lock; } percpu_down_read(&file_rwsem); @@ -1672,6 +1672,7 @@ out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); +free_lock: locks_free_lock(new_fl); return error; } @@ -2784,10 +2785,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { - seq_printf(f, "%s ", - (lease_breaking(fl)) - ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ " - : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ "); + int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; + + seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : + (type == F_RDLCK) ? "READ" : "UNLCK"); } if (inode) { /* userspace relies on this representation of dev_t */ diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f96073f25432..7cb5fd38eb14 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -277,6 +277,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) /* set up enough so that it can read an inode */ s->s_op = &minix_sops; + s->s_time_min = 0; + s->s_time_max = U32_MAX; root_inode = minix_iget(s, MINIX_ROOT_INO); if (IS_ERR(root_inode)) { ret = PTR_ERR(root_inode); diff --git a/fs/namei.c b/fs/namei.c index 209c51a5226c..671c3c1a3425 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -596,14 +596,12 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + if (nd->flags & LOOKUP_ROOT_GRABBED) { path_put(&nd->root); - nd->root.mnt = NULL; + nd->flags &= ~LOOKUP_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; rcu_read_unlock(); } nd->depth = 0; @@ -641,6 +639,14 @@ static bool legitimize_links(struct nameidata *nd) return true; } +static bool legitimize_root(struct nameidata *nd) +{ + if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + return true; + nd->flags |= LOOKUP_ROOT_GRABBED; + return legitimize_path(nd, &nd->root, nd->root_seq); +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -671,23 +677,18 @@ static int unlazy_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) - goto out2; - if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out1; - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) - goto out; - } + if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) + goto out; + if (unlikely(!legitimize_root(nd))) + goto out; rcu_read_unlock(); BUG_ON(nd->inode != parent->d_inode); return 0; -out2: +out1: nd->path.mnt = NULL; nd->path.dentry = NULL; -out1: - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; out: rcu_read_unlock(); return -ECHILD; @@ -727,23 +728,14 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) { - rcu_read_unlock(); - dput(dentry); - goto drop_root_mnt; - } + if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) { - rcu_read_unlock(); - dput(dentry); - return -ECHILD; - } - } - + if (unlikely(!legitimize_root(nd))) + goto out_dput; rcu_read_unlock(); return 0; @@ -753,9 +745,10 @@ out1: nd->path.dentry = NULL; out: rcu_read_unlock(); -drop_root_mnt: - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; + return -ECHILD; +out_dput: + rcu_read_unlock(); + dput(dentry); return -ECHILD; } @@ -819,6 +812,7 @@ static void set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); + nd->flags |= LOOKUP_ROOT_GRABBED; } } @@ -1735,8 +1729,6 @@ static int pick_link(struct nameidata *nd, struct path *link, nd->flags &= ~LOOKUP_RCU; nd->path.mnt = NULL; nd->path.dentry = NULL; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; rcu_read_unlock(); } else if (likely(unlazy_walk(nd)) == 0) error = nd_alloc_stack(nd); @@ -2350,7 +2342,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, path->dentry, flags & LOOKUP_PARENT); + audit_inode(name, path->dentry, 0); restore_nameidata(); putname(name); return retval; @@ -2391,7 +2383,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name, if (likely(!retval)) { *last = nd.last; *type = nd.last_type; - audit_inode(name, parent->dentry, LOOKUP_PARENT); + audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } else { putname(name); name = ERR_PTR(retval); @@ -2718,7 +2710,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, if (unlikely(error == -ESTALE)) error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) - audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL); + audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL); restore_nameidata(); putname(name); return error; @@ -3299,7 +3291,7 @@ static int do_last(struct nameidata *nd, if (error) return error; - audit_inode(nd->name, dir, LOOKUP_PARENT); + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return -EISDIR; diff --git a/fs/namespace.c b/fs/namespace.c index d28d30b13043..93c043245c46 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1643,13 +1643,18 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +#ifdef CONFIG_MANDATORY_FILE_LOCKING static inline bool may_mandlock(void) { -#ifndef CONFIG_MANDATORY_FILE_LOCKING - return false; -#endif return capable(CAP_SYS_ADMIN); } +#else +static inline bool may_mandlock(void) +{ + pr_warn("VFS: \"mand\" mount option not supported"); + return false; +} +#endif /* * Now umount can handle mount points as well as block devices. @@ -1675,8 +1680,6 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - lookup_flags |= LOOKUP_NO_EVAL; - retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; @@ -2463,6 +2466,26 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) unlock_mount_hash(); } +static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + + if (!__mnt_is_readonly(mnt) && + (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { + char *buf = (char *)__get_free_page(GFP_KERNEL); + char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); + struct tm tm; + + time64_to_tm(sb->s_time_max, 0, &tm); + + pr_warn("Mounted %s file system at %s supports timestamps until %04ld (0x%llx)\n", + sb->s_type->name, mntpath, + tm.tm_year+1900, (unsigned long long)sb->s_time_max); + + free_page((unsigned long)buf); + } +} + /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND @@ -2488,6 +2511,9 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (ret == 0) set_mount_attributes(mnt, mnt_flags); up_write(&sb->s_umount); + + mnt_warn_timestamp_expiry(path, &mnt->mnt); + return ret; } @@ -2528,6 +2554,9 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, } up_write(&sb->s_umount); } + + mnt_warn_timestamp_expiry(path, &mnt->mnt); + put_fs_context(fc); return err; } @@ -2736,8 +2765,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, return PTR_ERR(mnt); error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); - if (error < 0) + if (error < 0) { mntput(mnt); + return error; + } + + mnt_warn_timestamp_expiry(mountpoint, mnt); + return error; } @@ -3046,7 +3080,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, return -EINVAL; /* ... and get the mountpoint */ - retval = user_path(dir_name, &path); + retval = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (retval) return retval; @@ -3593,11 +3627,13 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!may_mount()) return -EPERM; - error = user_path_dir(new_root, &new); + error = user_path_at(AT_FDCWD, new_root, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) goto out0; - error = user_path_dir(put_old, &old); + error = user_path_at(AT_FDCWD, put_old, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) goto out1; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 976d4089e267..361cc10d6f95 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -207,7 +207,6 @@ TRACE_DEFINE_ENUM(LOOKUP_PARENT); TRACE_DEFINE_ENUM(LOOKUP_REVAL); TRACE_DEFINE_ENUM(LOOKUP_RCU); TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL); -TRACE_DEFINE_ENUM(LOOKUP_NO_EVAL); TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); @@ -226,7 +225,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_REVAL, "REVAL" }, \ { LOOKUP_RCU, "RCU" }, \ { LOOKUP_NO_REVAL, "NO_REVAL" }, \ - { LOOKUP_NO_EVAL, "NO_EVAL" }, \ { LOOKUP_OPEN, "OPEN" }, \ { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 703f595dce90..19a76cfa8b1f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2382,6 +2382,15 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; sb->s_export_op = &nfs_export_ops; + } else + sb->s_time_gran = 1000; + + if (server->nfs_client->rpc_ops->version != 4) { + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; + } else { + sb->s_time_min = S64_MIN; + sb->s_time_max = S64_MAX; } nfs_initialise_sb(sb); @@ -2402,7 +2411,6 @@ static void nfs_clone_super(struct super_block *sb, sb->s_maxbytes = old_sb->s_maxbytes; sb->s_xattr = old_sb->s_xattr; sb->s_op = old_sb->s_op; - sb->s_time_gran = 1; sb->s_export_op = old_sb->s_export_op; if (server->nfs_client->rpc_ops->version != 2) { @@ -2410,6 +2418,16 @@ static void nfs_clone_super(struct super_block *sb, * so ourselves when necessary. */ sb->s_flags |= SB_POSIXACL; + sb->s_time_gran = 1; + } else + sb->s_time_gran = 1000; + + if (server->nfs_client->rpc_ops->version != 4) { + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; + } else { + sb->s_time_min = S64_MIN; + sb->s_time_max = S64_MAX; } nfs_initialise_sb(sb); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3cf4f6aa48d6..2c215171c0eb 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1385,8 +1385,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) static int nfsd_fs_get_tree(struct fs_context *fc) { - fc->s_fs_info = get_net(fc->net_ns); - return vfs_get_super(fc, vfs_get_keyed_super, nfsd_fill_super); + return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns)); } static void nfsd_fs_free_fc(struct fs_context *fc) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 8baa34baf548..6c7388430ad3 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2899,15 +2899,18 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) ia_valid |= ATTR_MTIME | ATTR_CTIME; } } - if (ia_valid & ATTR_ATIME) - vi->i_atime = timespec64_trunc(attr->ia_atime, - vi->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - vi->i_mtime = timespec64_trunc(attr->ia_mtime, - vi->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - vi->i_ctime = timespec64_trunc(attr->ia_ctime, - vi->i_sb->s_time_gran); + if (ia_valid & ATTR_ATIME) { + vi->i_atime = timestamp_truncate(attr->ia_atime, + vi); + } + if (ia_valid & ATTR_MTIME) { + vi->i_mtime = timestamp_truncate(attr->ia_mtime, + vi); + } + if (ia_valid & ATTR_CTIME) { + vi->i_ctime = timestamp_truncate(attr->ia_ctime, + vi); + } mark_inode_dirty(vi); out: return err; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 08226a835ec3..b76ec6b88ded 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -478,6 +478,10 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = 0xffffffff; + sb->s_time_gran = NSEC_PER_MSEC; + sb->s_time_min = 0; + sb->s_time_max = U64_MAX / MSEC_PER_SEC; + sb_set_blocksize(sb, 0x200); bh = sb_bread(sb, 0); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 960f9a3c012d..a5612abc0936 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -555,7 +555,7 @@ static int orangefs_fsync(struct file *file, * Change the file pointer position for an instance of an open file. * * \note If .llseek is overriden, we must acquire lock as described in - * Documentation/filesystems/Locking. + * Documentation/filesystems/locking.rst. * * Future upgrade could support SEEK_DATA and SEEK_HOLE but would * require much changes to the FS diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 0c337d8bdaab..efb12197da18 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -940,7 +940,7 @@ out: int orangefs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { - int ret = -ENOENT; + int ret; struct inode *inode = path->dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 1dd710e5f376..3e7cf3d0a494 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -224,7 +224,7 @@ static int orangefs_symlink(struct inode *dir, struct orangefs_object_kref ref; struct inode *inode; struct iattr iattr; - int mode = 755; + int mode = 0755; int ret; gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 572dd29fbd54..34a6c99fa29b 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -246,7 +246,7 @@ struct orangefs_read_options { extern struct orangefs_stats orangefs_stats; /* - * NOTE: See Documentation/filesystems/porting for information + * NOTE: See Documentation/filesystems/porting.rst for information * on implementing FOO_I and properly accessing fs private data */ static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index cb8ec1f65c03..73c9775215b3 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -227,9 +227,8 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) /* Encode an upper or lower file handle */ fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : ovl_dentry_upper(dentry), !enc_lower); - err = PTR_ERR(fh); if (IS_ERR(fh)) - goto fail; + return PTR_ERR(fh); err = -EOVERFLOW; if (fh->len > buflen) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 7663aeb85fa3..bc14781886bf 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -383,7 +383,8 @@ static bool ovl_can_list(const char *s) return true; /* Never list trusted.overlay, list other trusted for superuser only */ - return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN); + return !ovl_is_private_xattr(s) && + ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 28a2d12a1029..a8279280e88d 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -66,6 +66,7 @@ struct ovl_fs { bool workdir_locked; /* Traps in ovl inode cache */ struct inode *upperdir_trap; + struct inode *workbasedir_trap; struct inode *workdir_trap; struct inode *indexdir_trap; /* Inode numbers in all layers do not use the high xino_bits */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b368e2e102fa..afbcb116a7f1 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -212,6 +212,7 @@ static void ovl_free_fs(struct ovl_fs *ofs) { unsigned i; + iput(ofs->workbasedir_trap); iput(ofs->indexdir_trap); iput(ofs->workdir_trap); iput(ofs->upperdir_trap); @@ -1003,6 +1004,25 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, return 0; } +/* + * Determine how we treat concurrent use of upperdir/workdir based on the + * index feature. This is papering over mount leaks of container runtimes, + * for example, an old overlay mount is leaked and now its upperdir is + * attempted to be used as a lower layer in a new overlay mount. + */ +static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) +{ + if (ofs->config.index) { + pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", + name); + return -EBUSY; + } else { + pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", + name); + return 0; + } +} + static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, struct path *upperpath) { @@ -1040,14 +1060,12 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); ofs->upper_mnt = upper_mnt; - err = -EBUSY; if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) { ofs->upperdir_locked = true; - } else if (ofs->config.index) { - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); - goto out; } else { - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + err = ovl_report_in_use(ofs, "upperdir"); + if (err) + goto out; } err = 0; @@ -1157,16 +1175,19 @@ static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->workbasedir = dget(workpath.dentry); - err = -EBUSY; if (ovl_inuse_trylock(ofs->workbasedir)) { ofs->workdir_locked = true; - } else if (ofs->config.index) { - pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); - goto out; } else { - pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + err = ovl_report_in_use(ofs, "workdir"); + if (err) + goto out; } + err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap, + "workdir"); + if (err) + goto out; + err = ovl_make_workdir(sb, ofs, &workpath); out: @@ -1313,16 +1334,16 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, if (err < 0) goto out; - err = -EBUSY; - if (ovl_is_inuse(stack[i].dentry)) { - pr_err("overlayfs: lowerdir is in-use as upperdir/workdir\n"); - goto out; - } - err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir"); if (err) goto out; + if (ovl_is_inuse(stack[i].dentry)) { + err = ovl_report_in_use(ofs, "lowerdir"); + if (err) + goto out; + } + mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { @@ -1469,8 +1490,8 @@ out_err: * - another layer of this overlayfs instance * - upper/work dir of any overlayfs instance */ -static int ovl_check_layer(struct super_block *sb, struct dentry *dentry, - const char *name) +static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, + struct dentry *dentry, const char *name) { struct dentry *next = dentry, *parent; int err = 0; @@ -1482,13 +1503,11 @@ static int ovl_check_layer(struct super_block *sb, struct dentry *dentry, /* Walk back ancestors to root (inclusive) looking for traps */ while (!err && parent != next) { - if (ovl_is_inuse(parent)) { - err = -EBUSY; - pr_err("overlayfs: %s path overlapping in-use upperdir/workdir\n", - name); - } else if (ovl_lookup_trap_inode(sb, parent)) { + if (ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; pr_err("overlayfs: overlapping %s path\n", name); + } else if (ovl_is_inuse(parent)) { + err = ovl_report_in_use(ofs, name); } next = parent; parent = dget_parent(next); @@ -1509,7 +1528,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb, int i, err; if (ofs->upper_mnt) { - err = ovl_check_layer(sb, ofs->upper_mnt->mnt_root, "upperdir"); + err = ovl_check_layer(sb, ofs, ofs->upper_mnt->mnt_root, + "upperdir"); if (err) return err; @@ -1520,13 +1540,14 @@ static int ovl_check_overlapping_layers(struct super_block *sb, * workbasedir. In that case, we already have their traps in * inode cache and we will catch that case on lookup. */ - err = ovl_check_layer(sb, ofs->workbasedir, "workdir"); + err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir"); if (err) return err; } for (i = 0; i < ofs->numlower; i++) { - err = ovl_check_layer(sb, ofs->lower_layers[i].mnt->mnt_root, + err = ovl_check_layer(sb, ofs, + ofs->lower_layers[i].mnt->mnt_root, "lowerdir"); if (err) return err; diff --git a/fs/proc/root.c b/fs/proc/root.c index 33f72d1b92cc..0b7c8dffc9ae 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -157,8 +157,7 @@ static int proc_get_tree(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; - fc->s_fs_info = ctx->pid_ns; - return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); + return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns); } static void proc_fs_context_free(struct fs_context *fc) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2bb3468fc93a..8caff834f002 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -144,6 +144,7 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec64 *time, if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lld.%lu-%c\n%n", (time64_t *)&time->tv_sec, &time->tv_nsec, &data_type, &header_length) == 3) { + time->tv_nsec *= 1000; if (data_type == 'C') *compressed = true; else @@ -151,6 +152,7 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec64 *time, } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lld.%lu\n%n", (time64_t *)&time->tv_sec, &time->tv_nsec, &header_length) == 2) { + time->tv_nsec *= 1000; *compressed = false; } else { time->tv_sec = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 922d083bbc7c..e8da1cde87b9 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -201,6 +201,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ + s->s_time_min = 0; + s->s_time_max = U32_MAX; /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 0f8b0ff1ba43..345db56c98fd 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -429,6 +429,8 @@ mmi_success: s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ + s->s_time_min = 0; + s->s_time_max = U32_MAX; /* ease the later tree level calculations */ sbi = QNX6_SB(s); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 733c6b4193dc..d82636e8eb65 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -36,6 +36,8 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "internal.h" struct ramfs_mount_opts { @@ -175,62 +177,52 @@ static const struct super_operations ramfs_ops = { .show_options = ramfs_show_options, }; -enum { +enum ramfs_param { Opt_mode, - Opt_err }; -static const match_table_t tokens = { - {Opt_mode, "mode=%o"}, - {Opt_err, NULL} +static const struct fs_parameter_spec ramfs_param_specs[] = { + fsparam_u32oct("mode", Opt_mode), + {} }; -static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) +const struct fs_parameter_description ramfs_fs_parameters = { + .name = "ramfs", + .specs = ramfs_param_specs, +}; + +static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; - int option; - int token; - char *p; - - opts->mode = RAMFS_DEFAULT_MODE; - - while ((p = strsep(&data, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_mode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; + struct fs_parse_result result; + struct ramfs_fs_info *fsi = fc->s_fs_info; + int opt; + + opt = fs_parse(fc, &ramfs_fs_parameters, param, &result); + if (opt < 0) { /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. */ - } + if (opt == -ENOPARAM) + opt = 0; + return opt; + } + + switch (opt) { + case Opt_mode: + fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; + break; } return 0; } -int ramfs_fill_super(struct super_block *sb, void *data, int silent) +static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct ramfs_fs_info *fsi; + struct ramfs_fs_info *fsi = sb->s_fs_info; struct inode *inode; - int err; - - fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); - sb->s_fs_info = fsi; - if (!fsi) - return -ENOMEM; - - err = ramfs_parse_options(data, &fsi->mount_opts); - if (err) - return err; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; @@ -247,10 +239,34 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -struct dentry *ramfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ramfs_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, ramfs_fill_super); + return get_tree_nodev(fc, ramfs_fill_super); +} + +static void ramfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations ramfs_context_ops = { + .free = ramfs_free_fc, + .parse_param = ramfs_parse_param, + .get_tree = ramfs_get_tree, +}; + +int ramfs_init_fs_context(struct fs_context *fc) +{ + struct ramfs_fs_info *fsi; + + fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + fsi->mount_opts.mode = RAMFS_DEFAULT_MODE; + fc->s_fs_info = fsi; + fc->ops = &ramfs_context_ops; + return 0; } static void ramfs_kill_sb(struct super_block *sb) @@ -261,7 +277,8 @@ static void ramfs_kill_sb(struct super_block *sb) static struct file_system_type ramfs_fs_type = { .name = "ramfs", - .mount = ramfs_mount, + .init_fs_context = ramfs_init_fs_context, + .parameters = &ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ab028ea0e561..d69b4ac0ae2f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1976,6 +1976,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error_unlocked; } + s->s_time_min = 0; + s->s_time_max = U32_MAX; + rs = SB_DISK_SUPER_BLOCK(s); /* * Let's do basic sanity check to verify that underlying device is not diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 7d580f7c3f1d..e582d001f792 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -65,7 +65,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/statfs.h> @@ -423,10 +423,10 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* * remounting must involve read-only */ -static int romfs_remount(struct super_block *sb, int *flags, char *data) +static int romfs_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - *flags |= SB_RDONLY; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_RDONLY; return 0; } @@ -434,7 +434,6 @@ static const struct super_operations romfs_super_ops = { .alloc_inode = romfs_alloc_inode, .free_inode = romfs_free_inode, .statfs = romfs_statfs, - .remount_fs = romfs_remount, }; /* @@ -457,7 +456,7 @@ static __u32 romfs_checksum(const void *data, int size) /* * fill in the superblock */ -static int romfs_fill_super(struct super_block *sb, void *data, int silent) +static int romfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct romfs_super_block *rsb; struct inode *root; @@ -478,6 +477,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_time_min = 0; + sb->s_time_max = 0; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD @@ -504,8 +505,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || img_size < ROMFH_SIZE) { - if (!silent) - pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n", + if (!(fc->sb_flags & SB_SILENT)) + errorf(fc, "VFS: Can't find a romfs filesystem on dev %s.\n", sb->s_id); goto error_rsb_inval; } @@ -518,7 +519,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) storage = sb->s_mtd ? "MTD" : "the block layer"; len = strnlen(rsb->name, ROMFS_MAXFN); - if (!silent) + if (!(fc->sb_flags & SB_SILENT)) pr_notice("Mounting image '%*.*s' through %s\n", (unsigned) len, (unsigned) len, rsb->name, storage); @@ -548,23 +549,34 @@ error_rsb: /* * get a superblock for mounting */ -static struct dentry *romfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int romfs_get_tree(struct fs_context *fc) { - struct dentry *ret = ERR_PTR(-EINVAL); + int ret = -EINVAL; #ifdef CONFIG_ROMFS_ON_MTD - ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super); + ret = get_tree_mtd(fc, romfs_fill_super); #endif #ifdef CONFIG_ROMFS_ON_BLOCK - if (ret == ERR_PTR(-EINVAL)) - ret = mount_bdev(fs_type, flags, dev_name, data, - romfs_fill_super); + if (ret == -EINVAL) + ret = get_tree_bdev(fc, romfs_fill_super); #endif return ret; } +static const struct fs_context_operations romfs_context_ops = { + .get_tree = romfs_get_tree, + .reconfigure = romfs_reconfigure, +}; + +/* + * Set up the filesystem mount context. + */ +static int romfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &romfs_context_ops; + return 0; +} + /* * destroy a romfs superblock in the appropriate manner */ @@ -587,7 +599,7 @@ static void romfs_kill_sb(struct super_block *sb) static struct file_system_type romfs_fs_type = { .owner = THIS_MODULE, .name = "romfs", - .mount = romfs_mount, + .init_fs_context = romfs_init_fs_context, .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index effa638d6d85..0cc4ceec0562 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -17,6 +17,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/mutex.h> @@ -36,26 +37,27 @@ static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; -static const struct squashfs_decompressor *supported_squashfs_filesystem(short - major, short minor, short id) +static const struct squashfs_decompressor *supported_squashfs_filesystem( + struct fs_context *fc, + short major, short minor, short id) { const struct squashfs_decompressor *decompressor; if (major < SQUASHFS_MAJOR) { - ERROR("Major/Minor mismatch, older Squashfs %d.%d " - "filesystems are unsupported\n", major, minor); + errorf(fc, "Major/Minor mismatch, older Squashfs %d.%d " + "filesystems are unsupported", major, minor); return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { - ERROR("Major/Minor mismatch, trying to mount newer " - "%d.%d filesystem\n", major, minor); - ERROR("Please update your kernel\n"); + errorf(fc, "Major/Minor mismatch, trying to mount newer " + "%d.%d filesystem", major, minor); + errorf(fc, "Please update your kernel"); return NULL; } decompressor = squashfs_lookup_decompressor(id); if (!decompressor->supported) { - ERROR("Filesystem uses \"%s\" compression. This is not " - "supported\n", decompressor->name); + errorf(fc, "Filesystem uses \"%s\" compression. This is not supported", + decompressor->name); return NULL; } @@ -63,7 +65,7 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(short } -static int squashfs_fill_super(struct super_block *sb, void *data, int silent) +static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; @@ -98,7 +100,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); if (IS_ERR(sblk)) { - ERROR("unable to read squashfs_super_block\n"); + errorf(fc, "unable to read squashfs_super_block"); err = PTR_ERR(sblk); sblk = NULL; goto failed_mount; @@ -109,14 +111,15 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { - if (!silent) - ERROR("Can't find a SQUASHFS superblock on %pg\n", - sb->s_bdev); + if (!(fc->sb_flags & SB_SILENT)) + errorf(fc, "Can't find a SQUASHFS superblock on %pg", + sb->s_bdev); goto failed_mount; } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( + fc, le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); @@ -133,15 +136,15 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) /* Check block size for sanity */ msblk->block_size = le32_to_cpu(sblk->block_size); if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) - goto failed_mount; + goto insanity; /* * Check the system page size is not larger than the filesystem * block size (by default 128K). This is currently not supported. */ if (PAGE_SIZE > msblk->block_size) { - ERROR("Page size > filesystem block size (%d). This is " - "currently not supported!\n", msblk->block_size); + errorf(fc, "Page size > filesystem block size (%d). This is " + "currently not supported!", msblk->block_size); goto failed_mount; } @@ -152,12 +155,12 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) /* Check that block_size and block_log match */ if (msblk->block_size != (1 << msblk->block_log)) - goto failed_mount; + goto insanity; /* Check the root inode for sanity */ root_inode = le64_to_cpu(sblk->root_inode); if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) - goto failed_mount; + goto insanity; msblk->inode_table = le64_to_cpu(sblk->inode_table_start); msblk->directory_table = le64_to_cpu(sblk->directory_table_start); @@ -183,6 +186,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; @@ -197,7 +202,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->read_page = squashfs_cache_init("data", squashfs_max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { - ERROR("Failed to allocate read_page block\n"); + errorf(fc, "Failed to allocate read_page block"); goto failed_mount; } @@ -205,7 +210,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; - goto failed_mount; + goto insanity; } /* Handle xattrs */ @@ -220,7 +225,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); if (IS_ERR(msblk->xattr_id_table)) { - ERROR("unable to read xattr id index table\n"); + errorf(fc, "unable to read xattr id index table"); err = PTR_ERR(msblk->xattr_id_table); msblk->xattr_id_table = NULL; if (err != -ENOTSUPP) @@ -234,7 +239,7 @@ allocate_id_index_table: le64_to_cpu(sblk->id_table_start), next_table, le16_to_cpu(sblk->no_ids)); if (IS_ERR(msblk->id_table)) { - ERROR("unable to read id index table\n"); + errorf(fc, "unable to read id index table"); err = PTR_ERR(msblk->id_table); msblk->id_table = NULL; goto failed_mount; @@ -250,7 +255,7 @@ allocate_id_index_table: msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, lookup_table_start, next_table, msblk->inodes); if (IS_ERR(msblk->inode_lookup_table)) { - ERROR("unable to read inode lookup table\n"); + errorf(fc, "unable to read inode lookup table"); err = PTR_ERR(msblk->inode_lookup_table); msblk->inode_lookup_table = NULL; goto failed_mount; @@ -275,7 +280,7 @@ handle_fragments: msblk->fragment_index = squashfs_read_fragment_index_table(sb, le64_to_cpu(sblk->fragment_table_start), next_table, fragments); if (IS_ERR(msblk->fragment_index)) { - ERROR("unable to read fragment index table\n"); + errorf(fc, "unable to read fragment index table"); err = PTR_ERR(msblk->fragment_index); msblk->fragment_index = NULL; goto failed_mount; @@ -286,13 +291,13 @@ check_directory_table: /* Sanity check directory_table */ if (msblk->directory_table > next_table) { err = -EINVAL; - goto failed_mount; + goto insanity; } /* Sanity check inode_table */ if (msblk->inode_table >= msblk->directory_table) { err = -EINVAL; - goto failed_mount; + goto insanity; } /* allocate root */ @@ -321,6 +326,8 @@ check_directory_table: kfree(sblk); return 0; +insanity: + errorf(fc, "squashfs image failed sanity check"); failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); @@ -336,6 +343,28 @@ failed_mount: return err; } +static int squashfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, squashfs_fill_super); +} + +static int squashfs_reconfigure(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_RDONLY; + return 0; +} + +static const struct fs_context_operations squashfs_context_ops = { + .get_tree = squashfs_get_tree, + .reconfigure = squashfs_reconfigure, +}; + +static int squashfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &squashfs_context_ops; + return 0; +} static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -358,14 +387,6 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) } -static int squashfs_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - *flags |= SB_RDONLY; - return 0; -} - - static void squashfs_put_super(struct super_block *sb) { if (sb->s_fs_info) { @@ -384,14 +405,6 @@ static void squashfs_put_super(struct super_block *sb) } } - -static struct dentry *squashfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); -} - - static struct kmem_cache *squashfs_inode_cachep; @@ -468,7 +481,7 @@ static void squashfs_free_inode(struct inode *inode) static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", - .mount = squashfs_mount, + .init_fs_context = squashfs_init_fs_context, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; @@ -479,7 +492,6 @@ static const struct super_operations squashfs_super_ops = { .free_inode = squashfs_free_inode, .statfs = squashfs_statfs, .put_super = squashfs_put_super, - .remount_fs = squashfs_remount }; module_init(init_squashfs_fs); diff --git a/fs/super.c b/fs/super.c index 5960578a4076..8020974b2a68 100644 --- a/fs/super.c +++ b/fs/super.c @@ -32,6 +32,7 @@ #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/cleancache.h> +#include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> @@ -257,6 +258,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; + s->s_time_min = TIME64_MIN; + s->s_time_max = TIME64_MAX; s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; @@ -290,6 +293,7 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); + fscrypt_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -1160,9 +1164,11 @@ int vfs_get_super(struct fs_context *fc, { int (*test)(struct super_block *, struct fs_context *); struct super_block *sb; + int err; switch (keying) { case vfs_get_single_super: + case vfs_get_single_reconf_super: test = test_single_super; break; case vfs_get_keyed_super: @@ -1180,18 +1186,29 @@ int vfs_get_super(struct fs_context *fc, return PTR_ERR(sb); if (!sb->s_root) { - int err = fill_super(sb, fc); - if (err) { - deactivate_locked_super(sb); - return err; - } + err = fill_super(sb, fc); + if (err) + goto error; sb->s_flags |= SB_ACTIVE; + fc->root = dget(sb->s_root); + } else { + fc->root = dget(sb->s_root); + if (keying == vfs_get_single_reconf_super) { + err = reconfigure_super(fc); + if (err < 0) { + dput(fc->root); + fc->root = NULL; + goto error; + } + } } - BUG_ON(fc->root); - fc->root = dget(sb->s_root); return 0; + +error: + deactivate_locked_super(sb); + return err; } EXPORT_SYMBOL(vfs_get_super); @@ -1211,7 +1228,26 @@ int get_tree_single(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_single); +int get_tree_single_reconf(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super); +} +EXPORT_SYMBOL(get_tree_single_reconf); + +int get_tree_keyed(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc), + void *key) +{ + fc->s_fs_info = key; + return vfs_get_super(fc, vfs_get_keyed_super, fill_super); +} +EXPORT_SYMBOL(get_tree_keyed); + #ifdef CONFIG_BLOCK + static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; @@ -1221,6 +1257,99 @@ static int set_bdev_super(struct super_block *s, void *data) return 0; } +static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) +{ + return set_bdev_super(s, fc->sget_key); +} + +static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) +{ + return s->s_bdev == fc->sget_key; +} + +/** + * get_tree_bdev - Get a superblock based on a single block device + * @fc: The filesystem context holding the parameters + * @fill_super: Helper to initialise a new superblock + */ +int get_tree_bdev(struct fs_context *fc, + int (*fill_super)(struct super_block *, + struct fs_context *)) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error = 0; + + if (!(fc->sb_flags & SB_RDONLY)) + mode |= FMODE_WRITE; + + if (!fc->source) + return invalf(fc, "No source specified"); + + bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type); + if (IS_ERR(bdev)) { + errorf(fc, "%s: Can't open blockdev", fc->source); + return PTR_ERR(bdev); + } + + /* Once the superblock is inserted into the list by sget_fc(), s_umount + * will protect the lockfs code from trying to start a snapshot while + * we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + return -EBUSY; + } + + fc->sb_flags |= SB_NOSEC; + fc->sget_key = bdev; + s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) + return PTR_ERR(s); + + if (s->s_root) { + /* Don't summarily change the RO/RW state. */ + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { + warnf(fc, "%pg: Can't mount, would change RO state", bdev); + deactivate_locked_super(s); + blkdev_put(bdev, mode); + return -EBUSY; + } + + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } else { + s->s_mode = mode; + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, fc); + if (error) { + deactivate_locked_super(s); + return error; + } + + s->s_flags |= SB_ACTIVE; + bdev->bd_super = s; + } + + BUG_ON(fc->root); + fc->root = dget(s->s_root); + return 0; +} +EXPORT_SYMBOL(get_tree_bdev); + static int test_bdev_super(struct super_block *s, void *data) { return (void *)s->s_bdev == data; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index d788b1daa7eb..cc8e2ed155c8 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -368,7 +368,8 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent) sbi->s_block_base = 0; mutex_init(&sbi->s_lock); sb->s_fs_info = sbi; - + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; sb_set_blocksize(sb, BLOCK_SIZE); for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { @@ -487,6 +488,8 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent) sbi->s_type = FSTYPE_V7; mutex_init(&sbi->s_lock); sb->s_fs_info = sbi; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; sb_set_blocksize(sb, 512); diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a6fc8aa1de7..48305ba41e3c 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -471,7 +471,11 @@ static int do_timerfd_settime(int ufd, int flags, break; } spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + + if (isalarm(ctx)) + hrtimer_cancel_wait_running(&ctx->t.alarm.timer); + else + hrtimer_cancel_wait_running(&ctx->t.tmr); } /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 400970d740bb..cd52585c8f4f 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1078,15 +1078,18 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (attr->ia_valid & ATTR_ATIME) - inode->i_atime = timespec64_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); - if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = timespec64_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); - if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = timespec64_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_ATIME) { + inode->i_atime = timestamp_truncate(attr->ia_atime, + inode); + } + if (attr->ia_valid & ATTR_MTIME) { + inode->i_mtime = timestamp_truncate(attr->ia_mtime, + inode); + } + if (attr->ia_valid & ATTR_CTIME) { + inode->i_ctime = timestamp_truncate(attr->ia_ctime, + inode); + } if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 034ad14710d1..5dc5abca11c7 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -185,6 +185,21 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GET_ENCRYPTION_POLICY: return fscrypt_ioctl_get_policy(file, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + return fscrypt_ioctl_add_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return fscrypt_ioctl_remove_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return fscrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + default: return -ENOTTY; } @@ -202,6 +217,11 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8c1d571334bc..5e1e8ec0589e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -318,6 +318,16 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +static int ubifs_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + if (!drop) + drop = fscrypt_drop_inode(inode); + + return drop; +} + static void ubifs_evict_inode(struct inode *inode) { int err; @@ -1994,6 +2004,7 @@ const struct super_operations ubifs_super_operations = { .free_inode = ubifs_free_inode, .put_super = ubifs_put_super, .write_inode = ubifs_write_inode, + .drop_inode = ubifs_drop_inode, .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index fcb41516ea59..6d30adb6b890 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -9,7 +9,7 @@ config UFS_FS this file system as well. Saying Y here will allow you to read from these partitions; if you also want to write to them, say Y to the experimental "UFS file system write support", below. Please read the - file <file:Documentation/filesystems/ufs.txt> for more information. + file <file:Documentation/admin-guide/ufs.rst> for more information. The recently released UFS2 variant (used in FreeBSD 5.x) is READ-ONLY supported. diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 4ed0dca52ec8..1da0be667409 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -843,6 +843,10 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_min = S32_MIN; + sb->s_time_max = S32_MAX; + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); @@ -861,6 +865,9 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_fshift = 9; uspi->s_sbsize = super_block_size = 1536; uspi->s_sbbase = 0; + sb->s_time_gran = 1; + sb->s_time_min = S64_MIN; + sb->s_time_max = S64_MAX; flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; break; diff --git a/fs/utimes.c b/fs/utimes.c index 350c9c16ace1..1ba3f7883870 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -36,16 +36,14 @@ static int utimes_common(const struct path *path, struct timespec64 *times) if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { - newattrs.ia_atime.tv_sec = times[0].tv_sec; - newattrs.ia_atime.tv_nsec = times[0].tv_nsec; + newattrs.ia_atime = timestamp_truncate(times[0], inode); newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { - newattrs.ia_mtime.tv_sec = times[1].tv_sec; - newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; + newattrs.ia_mtime = timestamp_truncate(times[1], inode); newattrs.ia_valid |= ATTR_MTIME_SET; } /* diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig new file mode 100644 index 000000000000..88fb25119899 --- /dev/null +++ b/fs/verity/Kconfig @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 + +config FS_VERITY + bool "FS Verity (read-only file-based authenticity protection)" + select CRYPTO + # SHA-256 is selected as it's intended to be the default hash algorithm. + # To avoid bloat, other wanted algorithms must be selected explicitly. + select CRYPTO_SHA256 + help + This option enables fs-verity. fs-verity is the dm-verity + mechanism implemented at the file level. On supported + filesystems (currently EXT4 and F2FS), userspace can use an + ioctl to enable verity for a file, which causes the filesystem + to build a Merkle tree for the file. The filesystem will then + transparently verify any data read from the file against the + Merkle tree. The file is also made read-only. + + This serves as an integrity check, but the availability of the + Merkle tree root hash also allows efficiently supporting + various use cases where normally the whole file would need to + be hashed at once, such as: (a) auditing (logging the file's + hash), or (b) authenticity verification (comparing the hash + against a known good value, e.g. from a digital signature). + + fs-verity is especially useful on large files where not all + the contents may actually be needed. Also, fs-verity verifies + data each time it is paged back in, which provides better + protection against malicious disks vs. an ahead-of-time hash. + + If unsure, say N. + +config FS_VERITY_DEBUG + bool "FS Verity debugging" + depends on FS_VERITY + help + Enable debugging messages related to fs-verity by default. + + Say N unless you are an fs-verity developer. + +config FS_VERITY_BUILTIN_SIGNATURES + bool "FS Verity builtin signature support" + depends on FS_VERITY + select SYSTEM_DATA_VERIFICATION + help + Support verifying signatures of verity files against the X.509 + certificates that have been loaded into the ".fs-verity" + kernel keyring. + + This is meant as a relatively simple mechanism that can be + used to provide an authenticity guarantee for verity files, as + an alternative to IMA appraisal. Userspace programs still + need to check that the verity bit is set in order to get an + authenticity guarantee. + + If unsure, say N. diff --git a/fs/verity/Makefile b/fs/verity/Makefile new file mode 100644 index 000000000000..570e9136334d --- /dev/null +++ b/fs/verity/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_FS_VERITY) += enable.o \ + hash_algs.o \ + init.o \ + measure.o \ + open.o \ + verify.o + +obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o diff --git a/fs/verity/enable.c b/fs/verity/enable.c new file mode 100644 index 000000000000..eabc6ac19906 --- /dev/null +++ b/fs/verity/enable.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/enable.c: ioctl to enable verity on a file + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <crypto/hash.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <linux/uaccess.h> + +static int build_merkle_tree_level(struct inode *inode, unsigned int level, + u64 num_blocks_to_hash, + const struct merkle_tree_params *params, + u8 *pending_hashes, + struct ahash_request *req) +{ + const struct fsverity_operations *vops = inode->i_sb->s_vop; + unsigned int pending_size = 0; + u64 dst_block_num; + u64 i; + int err; + + if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */ + return -EINVAL; + + if (level < params->num_levels) { + dst_block_num = params->level_start[level]; + } else { + if (WARN_ON(num_blocks_to_hash != 1)) + return -EINVAL; + dst_block_num = 0; /* unused */ + } + + for (i = 0; i < num_blocks_to_hash; i++) { + struct page *src_page; + + if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash) + pr_debug("Hashing block %llu of %llu for level %u\n", + i + 1, num_blocks_to_hash, level); + + if (level == 0) { + /* Leaf: hashing a data block */ + src_page = read_mapping_page(inode->i_mapping, i, NULL); + if (IS_ERR(src_page)) { + err = PTR_ERR(src_page); + fsverity_err(inode, + "Error %d reading data page %llu", + err, i); + return err; + } + } else { + /* Non-leaf: hashing hash block from level below */ + src_page = vops->read_merkle_tree_page(inode, + params->level_start[level - 1] + i); + if (IS_ERR(src_page)) { + err = PTR_ERR(src_page); + fsverity_err(inode, + "Error %d reading Merkle tree page %llu", + err, params->level_start[level - 1] + i); + return err; + } + } + + err = fsverity_hash_page(params, inode, req, src_page, + &pending_hashes[pending_size]); + put_page(src_page); + if (err) + return err; + pending_size += params->digest_size; + + if (level == params->num_levels) /* Root hash? */ + return 0; + + if (pending_size + params->digest_size > params->block_size || + i + 1 == num_blocks_to_hash) { + /* Flush the pending hash block */ + memset(&pending_hashes[pending_size], 0, + params->block_size - pending_size); + err = vops->write_merkle_tree_block(inode, + pending_hashes, + dst_block_num, + params->log_blocksize); + if (err) { + fsverity_err(inode, + "Error %d writing Merkle tree block %llu", + err, dst_block_num); + return err; + } + dst_block_num++; + pending_size = 0; + } + + if (fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } + return 0; +} + +/* + * Build the Merkle tree for the given inode using the given parameters, and + * return the root hash in @root_hash. + * + * The tree is written to a filesystem-specific location as determined by the + * ->write_merkle_tree_block() method. However, the blocks that comprise the + * tree are the same for all filesystems. + */ +static int build_merkle_tree(struct inode *inode, + const struct merkle_tree_params *params, + u8 *root_hash) +{ + u8 *pending_hashes; + struct ahash_request *req; + u64 blocks; + unsigned int level; + int err = -ENOMEM; + + if (inode->i_size == 0) { + /* Empty file is a special case; root hash is all 0's */ + memset(root_hash, 0, params->digest_size); + return 0; + } + + pending_hashes = kmalloc(params->block_size, GFP_KERNEL); + req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL); + if (!pending_hashes || !req) + goto out; + + /* + * Build each level of the Merkle tree, starting at the leaf level + * (level 0) and ascending to the root node (level 'num_levels - 1'). + * Then at the end (level 'num_levels'), calculate the root hash. + */ + blocks = (inode->i_size + params->block_size - 1) >> + params->log_blocksize; + for (level = 0; level <= params->num_levels; level++) { + err = build_merkle_tree_level(inode, level, blocks, params, + pending_hashes, req); + if (err) + goto out; + blocks = (blocks + params->hashes_per_block - 1) >> + params->log_arity; + } + memcpy(root_hash, pending_hashes, params->digest_size); + err = 0; +out: + kfree(pending_hashes); + ahash_request_free(req); + return err; +} + +static int enable_verity(struct file *filp, + const struct fsverity_enable_arg *arg) +{ + struct inode *inode = file_inode(filp); + const struct fsverity_operations *vops = inode->i_sb->s_vop; + struct merkle_tree_params params = { }; + struct fsverity_descriptor *desc; + size_t desc_size = sizeof(*desc) + arg->sig_size; + struct fsverity_info *vi; + int err; + + /* Start initializing the fsverity_descriptor */ + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) + return -ENOMEM; + desc->version = 1; + desc->hash_algorithm = arg->hash_algorithm; + desc->log_blocksize = ilog2(arg->block_size); + + /* Get the salt if the user provided one */ + if (arg->salt_size && + copy_from_user(desc->salt, + (const u8 __user *)(uintptr_t)arg->salt_ptr, + arg->salt_size)) { + err = -EFAULT; + goto out; + } + desc->salt_size = arg->salt_size; + + /* Get the signature if the user provided one */ + if (arg->sig_size && + copy_from_user(desc->signature, + (const u8 __user *)(uintptr_t)arg->sig_ptr, + arg->sig_size)) { + err = -EFAULT; + goto out; + } + desc->sig_size = cpu_to_le32(arg->sig_size); + + desc->data_size = cpu_to_le64(inode->i_size); + + /* Prepare the Merkle tree parameters */ + err = fsverity_init_merkle_tree_params(¶ms, inode, + arg->hash_algorithm, + desc->log_blocksize, + desc->salt, desc->salt_size); + if (err) + goto out; + + /* + * Start enabling verity on this file, serialized by the inode lock. + * Fail if verity is already enabled or is already being enabled. + */ + inode_lock(inode); + if (IS_VERITY(inode)) + err = -EEXIST; + else + err = vops->begin_enable_verity(filp); + inode_unlock(inode); + if (err) + goto out; + + /* + * Build the Merkle tree. Don't hold the inode lock during this, since + * on huge files this may take a very long time and we don't want to + * force unrelated syscalls like chown() to block forever. We don't + * need the inode lock here because deny_write_access() already prevents + * the file from being written to or truncated, and we still serialize + * ->begin_enable_verity() and ->end_enable_verity() using the inode + * lock and only allow one process to be here at a time on a given file. + */ + pr_debug("Building Merkle tree...\n"); + BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE); + err = build_merkle_tree(inode, ¶ms, desc->root_hash); + if (err) { + fsverity_err(inode, "Error %d building Merkle tree", err); + goto rollback; + } + pr_debug("Done building Merkle tree. Root hash is %s:%*phN\n", + params.hash_alg->name, params.digest_size, desc->root_hash); + + /* + * Create the fsverity_info. Don't bother trying to save work by + * reusing the merkle_tree_params from above. Instead, just create the + * fsverity_info from the fsverity_descriptor as if it were just loaded + * from disk. This is simpler, and it serves as an extra check that the + * metadata we're writing is valid before actually enabling verity. + */ + vi = fsverity_create_info(inode, desc, desc_size); + if (IS_ERR(vi)) { + err = PTR_ERR(vi); + goto rollback; + } + + if (arg->sig_size) + pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n", + arg->sig_size); + + /* + * Tell the filesystem to finish enabling verity on the file. + * Serialized with ->begin_enable_verity() by the inode lock. + */ + inode_lock(inode); + err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size); + inode_unlock(inode); + if (err) { + fsverity_err(inode, "%ps() failed with err %d", + vops->end_enable_verity, err); + fsverity_free_info(vi); + } else if (WARN_ON(!IS_VERITY(inode))) { + err = -EINVAL; + fsverity_free_info(vi); + } else { + /* Successfully enabled verity */ + + /* + * Readers can start using ->i_verity_info immediately, so it + * can't be rolled back once set. So don't set it until just + * after the filesystem has successfully enabled verity. + */ + fsverity_set_info(inode, vi); + } +out: + kfree(params.hashstate); + kfree(desc); + return err; + +rollback: + inode_lock(inode); + (void)vops->end_enable_verity(filp, NULL, 0, params.tree_size); + inode_unlock(inode); + goto out; +} + +/** + * fsverity_ioctl_enable() - enable verity on a file + * + * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of + * Documentation/filesystems/fsverity.rst for the documentation. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + struct fsverity_enable_arg arg; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.version != 1) + return -EINVAL; + + if (arg.__reserved1 || + memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2))) + return -EINVAL; + + if (arg.block_size != PAGE_SIZE) + return -EINVAL; + + if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt)) + return -EMSGSIZE; + + if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) + return -EMSGSIZE; + + /* + * Require a regular file with write access. But the actual fd must + * still be readonly so that we can lock out all writers. This is + * needed to guarantee that no writable fds exist to the file once it + * has verity enabled, and to stabilize the data being hashed. + */ + + err = inode_permission(inode, MAY_WRITE); + if (err) + return err; + + if (IS_APPEND(inode)) + return -EPERM; + + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) /* -EROFS */ + return err; + + err = deny_write_access(filp); + if (err) /* -ETXTBSY */ + goto out_drop_write; + + err = enable_verity(filp, &arg); + if (err) + goto out_allow_write_access; + + /* + * Some pages of the file may have been evicted from pagecache after + * being used in the Merkle tree construction, then read into pagecache + * again by another process reading from the file concurrently. Since + * these pages didn't undergo verification against the file measurement + * which fs-verity now claims to be enforcing, we have to wipe the + * pagecache to ensure that all future reads are verified. + */ + filemap_write_and_wait(inode->i_mapping); + invalidate_inode_pages2(inode->i_mapping); + + /* + * allow_write_access() is needed to pair with deny_write_access(). + * Regardless, the filesystem won't allow writing to verity files. + */ +out_allow_write_access: + allow_write_access(filp); +out_drop_write: + mnt_drop_write_file(filp); + return err; +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_enable); diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h new file mode 100644 index 000000000000..e74c79b64d88 --- /dev/null +++ b/fs/verity/fsverity_private.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs-verity: read-only file-based authenticity protection + * + * Copyright 2019 Google LLC + */ + +#ifndef _FSVERITY_PRIVATE_H +#define _FSVERITY_PRIVATE_H + +#ifdef CONFIG_FS_VERITY_DEBUG +#define DEBUG +#endif + +#define pr_fmt(fmt) "fs-verity: " fmt + +#include <crypto/sha.h> +#include <linux/fsverity.h> + +struct ahash_request; + +/* + * Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty; + * it's enough for over U64_MAX bytes of data using SHA-256 and 4K blocks. + */ +#define FS_VERITY_MAX_LEVELS 8 + +/* + * Largest digest size among all hash algorithms supported by fs-verity. + * Currently assumed to be <= size of fsverity_descriptor::root_hash. + */ +#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE + +/* A hash algorithm supported by fs-verity */ +struct fsverity_hash_alg { + struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ + const char *name; /* crypto API name, e.g. sha256 */ + unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ + unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ +}; + +/* Merkle tree parameters: hash algorithm, initial hash state, and topology */ +struct merkle_tree_params { + const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ + const u8 *hashstate; /* initial hash state or NULL */ + unsigned int digest_size; /* same as hash_alg->digest_size */ + unsigned int block_size; /* size of data and tree blocks */ + unsigned int hashes_per_block; /* number of hashes per tree block */ + unsigned int log_blocksize; /* log2(block_size) */ + unsigned int log_arity; /* log2(hashes_per_block) */ + unsigned int num_levels; /* number of levels in Merkle tree */ + u64 tree_size; /* Merkle tree size in bytes */ + + /* + * Starting block index for each tree level, ordered from leaf level (0) + * to root level ('num_levels - 1') + */ + u64 level_start[FS_VERITY_MAX_LEVELS]; +}; + +/** + * fsverity_info - cached verity metadata for an inode + * + * When a verity file is first opened, an instance of this struct is allocated + * and stored in ->i_verity_info; it remains until the inode is evicted. It + * caches information about the Merkle tree that's needed to efficiently verify + * data read from the file. It also caches the file measurement. The Merkle + * tree pages themselves are not cached here, but the filesystem may cache them. + */ +struct fsverity_info { + struct merkle_tree_params tree_params; + u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; + u8 measurement[FS_VERITY_MAX_DIGEST_SIZE]; + const struct inode *inode; +}; + +/* + * Merkle tree properties. The file measurement is the hash of this structure + * excluding the signature and with the sig_size field set to 0. + */ +struct fsverity_descriptor { + __u8 version; /* must be 1 */ + __u8 hash_algorithm; /* Merkle tree hash algorithm */ + __u8 log_blocksize; /* log2 of size of data and tree blocks */ + __u8 salt_size; /* size of salt in bytes; 0 if none */ + __le32 sig_size; /* size of signature in bytes; 0 if none */ + __le64 data_size; /* size of file the Merkle tree is built over */ + __u8 root_hash[64]; /* Merkle tree root hash */ + __u8 salt[32]; /* salt prepended to each hashed block */ + __u8 __reserved[144]; /* must be 0's */ + __u8 signature[]; /* optional PKCS#7 signature */ +}; + +/* Arbitrary limit to bound the kmalloc() size. Can be changed. */ +#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 + +#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ + sizeof(struct fsverity_descriptor)) + +/* + * Format in which verity file measurements are signed. This is the same as + * 'struct fsverity_digest', except here some magic bytes are prepended to + * provide some context about what is being signed in case the same key is used + * for non-fsverity purposes, and here the fields have fixed endianness. + */ +struct fsverity_signed_digest { + char magic[8]; /* must be "FSVerity" */ + __le16 digest_algorithm; + __le16 digest_size; + __u8 digest[]; +}; + +/* hash_algs.c */ + +extern struct fsverity_hash_alg fsverity_hash_algs[]; + +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num); +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, + const u8 *salt, size_t salt_size); +int fsverity_hash_page(const struct merkle_tree_params *params, + const struct inode *inode, + struct ahash_request *req, struct page *page, u8 *out); +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, + const void *data, size_t size, u8 *out); +void __init fsverity_check_hash_algs(void); + +/* init.c */ + +extern void __printf(3, 4) __cold +fsverity_msg(const struct inode *inode, const char *level, + const char *fmt, ...); + +#define fsverity_warn(inode, fmt, ...) \ + fsverity_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) +#define fsverity_err(inode, fmt, ...) \ + fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) + +/* open.c */ + +int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, + const struct inode *inode, + unsigned int hash_algorithm, + unsigned int log_blocksize, + const u8 *salt, size_t salt_size); + +struct fsverity_info *fsverity_create_info(const struct inode *inode, + void *desc, size_t desc_size); + +void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); + +void fsverity_free_info(struct fsverity_info *vi); + +int __init fsverity_init_info_cache(void); +void __init fsverity_exit_info_cache(void); + +/* signature.c */ + +#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES +int fsverity_verify_signature(const struct fsverity_info *vi, + const struct fsverity_descriptor *desc, + size_t desc_size); + +int __init fsverity_init_signature(void); +#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ +static inline int +fsverity_verify_signature(const struct fsverity_info *vi, + const struct fsverity_descriptor *desc, + size_t desc_size) +{ + return 0; +} + +static inline int fsverity_init_signature(void) +{ + return 0; +} +#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ + +/* verify.c */ + +int __init fsverity_init_workqueue(void); +void __init fsverity_exit_workqueue(void); + +#endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c new file mode 100644 index 000000000000..31e6d7d2389a --- /dev/null +++ b/fs/verity/hash_algs.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/hash_algs.c: fs-verity hash algorithms + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <crypto/hash.h> +#include <linux/scatterlist.h> + +/* The hash algorithms supported by fs-verity */ +struct fsverity_hash_alg fsverity_hash_algs[] = { + [FS_VERITY_HASH_ALG_SHA256] = { + .name = "sha256", + .digest_size = SHA256_DIGEST_SIZE, + .block_size = SHA256_BLOCK_SIZE, + }, + [FS_VERITY_HASH_ALG_SHA512] = { + .name = "sha512", + .digest_size = SHA512_DIGEST_SIZE, + .block_size = SHA512_BLOCK_SIZE, + }, +}; + +/** + * fsverity_get_hash_alg() - validate and prepare a hash algorithm + * @inode: optional inode for logging purposes + * @num: the hash algorithm number + * + * Get the struct fsverity_hash_alg for the given hash algorithm number, and + * ensure it has a hash transform ready to go. The hash transforms are + * allocated on-demand so that we don't waste resources unnecessarily, and + * because the crypto modules may be initialized later than fs/verity/. + * + * Return: pointer to the hash alg on success, else an ERR_PTR() + */ +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num) +{ + struct fsverity_hash_alg *alg; + struct crypto_ahash *tfm; + int err; + + if (num >= ARRAY_SIZE(fsverity_hash_algs) || + !fsverity_hash_algs[num].name) { + fsverity_warn(inode, "Unknown hash algorithm number: %u", num); + return ERR_PTR(-EINVAL); + } + alg = &fsverity_hash_algs[num]; + + /* pairs with cmpxchg() below */ + tfm = READ_ONCE(alg->tfm); + if (likely(tfm != NULL)) + return alg; + /* + * Using the shash API would make things a bit simpler, but the ahash + * API is preferable as it allows the use of crypto accelerators. + */ + tfm = crypto_alloc_ahash(alg->name, 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + fsverity_warn(inode, + "Missing crypto API support for hash algorithm \"%s\"", + alg->name); + return ERR_PTR(-ENOPKG); + } + fsverity_err(inode, + "Error allocating hash algorithm \"%s\": %ld", + alg->name, PTR_ERR(tfm)); + return ERR_CAST(tfm); + } + + err = -EINVAL; + if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm))) + goto err_free_tfm; + if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm))) + goto err_free_tfm; + + pr_info("%s using implementation \"%s\"\n", + alg->name, crypto_ahash_driver_name(tfm)); + + /* pairs with READ_ONCE() above */ + if (cmpxchg(&alg->tfm, NULL, tfm) != NULL) + crypto_free_ahash(tfm); + + return alg; + +err_free_tfm: + crypto_free_ahash(tfm); + return ERR_PTR(err); +} + +/** + * fsverity_prepare_hash_state() - precompute the initial hash state + * @alg: hash algorithm + * @salt: a salt which is to be prepended to all data to be hashed + * @salt_size: salt size in bytes, possibly 0 + * + * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed + * initial hash state on success or an ERR_PTR() on failure. + */ +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, + const u8 *salt, size_t salt_size) +{ + u8 *hashstate = NULL; + struct ahash_request *req = NULL; + u8 *padded_salt = NULL; + size_t padded_salt_size; + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + int err; + + if (salt_size == 0) + return NULL; + + hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL); + if (!hashstate) + return ERR_PTR(-ENOMEM); + + req = ahash_request_alloc(alg->tfm, GFP_KERNEL); + if (!req) { + err = -ENOMEM; + goto err_free; + } + + /* + * Zero-pad the salt to the next multiple of the input size of the hash + * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128 + * bytes for SHA-512. This ensures that the hash algorithm won't have + * any bytes buffered internally after processing the salt, thus making + * salted hashing just as fast as unsalted hashing. + */ + padded_salt_size = round_up(salt_size, alg->block_size); + padded_salt = kzalloc(padded_salt_size, GFP_KERNEL); + if (!padded_salt) { + err = -ENOMEM; + goto err_free; + } + memcpy(padded_salt, salt, salt_size); + + sg_init_one(&sg, padded_salt, padded_salt_size); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ahash_request_set_crypt(req, &sg, NULL, padded_salt_size); + + err = crypto_wait_req(crypto_ahash_init(req), &wait); + if (err) + goto err_free; + + err = crypto_wait_req(crypto_ahash_update(req), &wait); + if (err) + goto err_free; + + err = crypto_ahash_export(req, hashstate); + if (err) + goto err_free; +out: + ahash_request_free(req); + kfree(padded_salt); + return hashstate; + +err_free: + kfree(hashstate); + hashstate = ERR_PTR(err); + goto out; +} + +/** + * fsverity_hash_page() - hash a single data or hash page + * @params: the Merkle tree's parameters + * @inode: inode for which the hashing is being done + * @req: preallocated hash request + * @page: the page to hash + * @out: output digest, size 'params->digest_size' bytes + * + * Hash a single data or hash block, assuming block_size == PAGE_SIZE. + * The hash is salted if a salt is specified in the Merkle tree parameters. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_hash_page(const struct merkle_tree_params *params, + const struct inode *inode, + struct ahash_request *req, struct page *page, u8 *out) +{ + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + int err; + + if (WARN_ON(params->block_size != PAGE_SIZE)) + return -EINVAL; + + sg_init_table(&sg, 1); + sg_set_page(&sg, page, PAGE_SIZE, 0); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ahash_request_set_crypt(req, &sg, out, PAGE_SIZE); + + if (params->hashstate) { + err = crypto_ahash_import(req, params->hashstate); + if (err) { + fsverity_err(inode, + "Error %d importing hash state", err); + return err; + } + err = crypto_ahash_finup(req); + } else { + err = crypto_ahash_digest(req); + } + + err = crypto_wait_req(err, &wait); + if (err) + fsverity_err(inode, "Error %d computing page hash", err); + return err; +} + +/** + * fsverity_hash_buffer() - hash some data + * @alg: the hash algorithm to use + * @data: the data to hash + * @size: size of data to hash, in bytes + * @out: output digest, size 'alg->digest_size' bytes + * + * Hash some data which is located in physically contiguous memory (i.e. memory + * allocated by kmalloc(), not by vmalloc()). No salt is used. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, + const void *data, size_t size, u8 *out) +{ + struct ahash_request *req; + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + int err; + + req = ahash_request_alloc(alg->tfm, GFP_KERNEL); + if (!req) + return -ENOMEM; + + sg_init_one(&sg, data, size); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ahash_request_set_crypt(req, &sg, out, size); + + err = crypto_wait_req(crypto_ahash_digest(req), &wait); + + ahash_request_free(req); + return err; +} + +void __init fsverity_check_hash_algs(void) +{ + size_t i; + + /* + * Sanity check the hash algorithms (could be a build-time check, but + * they're in an array) + */ + for (i = 0; i < ARRAY_SIZE(fsverity_hash_algs); i++) { + const struct fsverity_hash_alg *alg = &fsverity_hash_algs[i]; + + if (!alg->name) + continue; + + BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE); + + /* + * For efficiency, the implementation currently assumes the + * digest and block sizes are powers of 2. This limitation can + * be lifted if the code is updated to handle other values. + */ + BUG_ON(!is_power_of_2(alg->digest_size)); + BUG_ON(!is_power_of_2(alg->block_size)); + } +} diff --git a/fs/verity/init.c b/fs/verity/init.c new file mode 100644 index 000000000000..94c104e00861 --- /dev/null +++ b/fs/verity/init.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/init.c: fs-verity module initialization and logging + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/ratelimit.h> + +void fsverity_msg(const struct inode *inode, const char *level, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (inode) + printk("%sfs-verity (%s, inode %lu): %pV\n", + level, inode->i_sb->s_id, inode->i_ino, &vaf); + else + printk("%sfs-verity: %pV\n", level, &vaf); + va_end(args); +} + +static int __init fsverity_init(void) +{ + int err; + + fsverity_check_hash_algs(); + + err = fsverity_init_info_cache(); + if (err) + return err; + + err = fsverity_init_workqueue(); + if (err) + goto err_exit_info_cache; + + err = fsverity_init_signature(); + if (err) + goto err_exit_workqueue; + + pr_debug("Initialized fs-verity\n"); + return 0; + +err_exit_workqueue: + fsverity_exit_workqueue(); +err_exit_info_cache: + fsverity_exit_info_cache(); + return err; +} +late_initcall(fsverity_init) diff --git a/fs/verity/measure.c b/fs/verity/measure.c new file mode 100644 index 000000000000..05049b68c745 --- /dev/null +++ b/fs/verity/measure.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/measure.c: ioctl to get a verity file's measurement + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/uaccess.h> + +/** + * fsverity_ioctl_measure() - get a verity file's measurement + * + * Retrieve the file measurement that the kernel is enforcing for reads from a + * verity file. See the "FS_IOC_MEASURE_VERITY" section of + * Documentation/filesystems/fsverity.rst for the documentation. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_ioctl_measure(struct file *filp, void __user *_uarg) +{ + const struct inode *inode = file_inode(filp); + struct fsverity_digest __user *uarg = _uarg; + const struct fsverity_info *vi; + const struct fsverity_hash_alg *hash_alg; + struct fsverity_digest arg; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + hash_alg = vi->tree_params.hash_alg; + + /* + * The user specifies the digest_size their buffer has space for; we can + * return the digest if it fits in the available space. We write back + * the actual size, which may be shorter than the user-specified size. + */ + + if (get_user(arg.digest_size, &uarg->digest_size)) + return -EFAULT; + if (arg.digest_size < hash_alg->digest_size) + return -EOVERFLOW; + + memset(&arg, 0, sizeof(arg)); + arg.digest_algorithm = hash_alg - fsverity_hash_algs; + arg.digest_size = hash_alg->digest_size; + + if (copy_to_user(uarg, &arg, sizeof(arg))) + return -EFAULT; + + if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); diff --git a/fs/verity/open.c b/fs/verity/open.c new file mode 100644 index 000000000000..63d1004b688c --- /dev/null +++ b/fs/verity/open.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/open.c: opening fs-verity files + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/slab.h> + +static struct kmem_cache *fsverity_info_cachep; + +/** + * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters + * @params: the parameters struct to initialize + * @inode: the inode for which the Merkle tree is being built + * @hash_algorithm: number of hash algorithm to use + * @log_blocksize: log base 2 of block size to use + * @salt: pointer to salt (optional) + * @salt_size: size of salt, possibly 0 + * + * Validate the hash algorithm and block size, then compute the tree topology + * (num levels, num blocks in each level, etc.) and initialize @params. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, + const struct inode *inode, + unsigned int hash_algorithm, + unsigned int log_blocksize, + const u8 *salt, size_t salt_size) +{ + const struct fsverity_hash_alg *hash_alg; + int err; + u64 blocks; + u64 offset; + int level; + + memset(params, 0, sizeof(*params)); + + hash_alg = fsverity_get_hash_alg(inode, hash_algorithm); + if (IS_ERR(hash_alg)) + return PTR_ERR(hash_alg); + params->hash_alg = hash_alg; + params->digest_size = hash_alg->digest_size; + + params->hashstate = fsverity_prepare_hash_state(hash_alg, salt, + salt_size); + if (IS_ERR(params->hashstate)) { + err = PTR_ERR(params->hashstate); + params->hashstate = NULL; + fsverity_err(inode, "Error %d preparing hash state", err); + goto out_err; + } + + if (log_blocksize != PAGE_SHIFT) { + fsverity_warn(inode, "Unsupported log_blocksize: %u", + log_blocksize); + err = -EINVAL; + goto out_err; + } + params->log_blocksize = log_blocksize; + params->block_size = 1 << log_blocksize; + + if (WARN_ON(!is_power_of_2(params->digest_size))) { + err = -EINVAL; + goto out_err; + } + if (params->block_size < 2 * params->digest_size) { + fsverity_warn(inode, + "Merkle tree block size (%u) too small for hash algorithm \"%s\"", + params->block_size, hash_alg->name); + err = -EINVAL; + goto out_err; + } + params->log_arity = params->log_blocksize - ilog2(params->digest_size); + params->hashes_per_block = 1 << params->log_arity; + + pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n", + hash_alg->name, params->block_size, params->hashes_per_block, + (int)salt_size, salt); + + /* + * Compute the number of levels in the Merkle tree and create a map from + * level to the starting block of that level. Level 'num_levels - 1' is + * the root and is stored first. Level 0 is the level directly "above" + * the data blocks and is stored last. + */ + + /* Compute number of levels and the number of blocks in each level */ + blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); + while (blocks > 1) { + if (params->num_levels >= FS_VERITY_MAX_LEVELS) { + fsverity_err(inode, "Too many levels in Merkle tree"); + err = -EINVAL; + goto out_err; + } + blocks = (blocks + params->hashes_per_block - 1) >> + params->log_arity; + /* temporarily using level_start[] to store blocks in level */ + params->level_start[params->num_levels++] = blocks; + } + + /* Compute the starting block of each level */ + offset = 0; + for (level = (int)params->num_levels - 1; level >= 0; level--) { + blocks = params->level_start[level]; + params->level_start[level] = offset; + pr_debug("Level %d is %llu blocks starting at index %llu\n", + level, blocks, offset); + offset += blocks; + } + + params->tree_size = offset << log_blocksize; + return 0; + +out_err: + kfree(params->hashstate); + memset(params, 0, sizeof(*params)); + return err; +} + +/* + * Compute the file measurement by hashing the fsverity_descriptor excluding the + * signature and with the sig_size field set to 0. + */ +static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg, + struct fsverity_descriptor *desc, + u8 *measurement) +{ + __le32 sig_size = desc->sig_size; + int err; + + desc->sig_size = 0; + err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement); + desc->sig_size = sig_size; + + return err; +} + +/* + * Validate the given fsverity_descriptor and create a new fsverity_info from + * it. The signature (if present) is also checked. + */ +struct fsverity_info *fsverity_create_info(const struct inode *inode, + void *_desc, size_t desc_size) +{ + struct fsverity_descriptor *desc = _desc; + struct fsverity_info *vi; + int err; + + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return ERR_PTR(-EINVAL); + } + + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return ERR_PTR(-EINVAL); + } + + if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return ERR_PTR(-EINVAL); + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return ERR_PTR(-EINVAL); + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return ERR_PTR(-EINVAL); + } + + vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); + if (!vi) + return ERR_PTR(-ENOMEM); + vi->inode = inode; + + err = fsverity_init_merkle_tree_params(&vi->tree_params, inode, + desc->hash_algorithm, + desc->log_blocksize, + desc->salt, desc->salt_size); + if (err) { + fsverity_err(inode, + "Error %d initializing Merkle tree parameters", + err); + goto out; + } + + memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); + + err = compute_file_measurement(vi->tree_params.hash_alg, desc, + vi->measurement); + if (err) { + fsverity_err(inode, "Error %d computing file measurement", err); + goto out; + } + pr_debug("Computed file measurement: %s:%*phN\n", + vi->tree_params.hash_alg->name, + vi->tree_params.digest_size, vi->measurement); + + err = fsverity_verify_signature(vi, desc, desc_size); +out: + if (err) { + fsverity_free_info(vi); + vi = ERR_PTR(err); + } + return vi; +} + +void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) +{ + /* + * Multiple processes may race to set ->i_verity_info, so use cmpxchg. + * This pairs with the READ_ONCE() in fsverity_get_info(). + */ + if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL) + fsverity_free_info(vi); +} + +void fsverity_free_info(struct fsverity_info *vi) +{ + if (!vi) + return; + kfree(vi->tree_params.hashstate); + kmem_cache_free(fsverity_info_cachep, vi); +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + int res; + + if (vi) + return 0; + + res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); + if (res < 0) { + fsverity_err(inode, + "Error %d getting verity descriptor size", res); + return res; + } + if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + fsverity_err(inode, "Verity descriptor is too large (%d bytes)", + res); + return -EMSGSIZE; + } + desc = kmalloc(res, GFP_KERNEL); + if (!desc) + return -ENOMEM; + res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); + if (res < 0) { + fsverity_err(inode, "Error %d reading verity descriptor", res); + goto out_free_desc; + } + + vi = fsverity_create_info(inode, desc, res); + if (IS_ERR(vi)) { + res = PTR_ERR(vi); + goto out_free_desc; + } + + fsverity_set_info(inode, vi); + res = 0; +out_free_desc: + kfree(desc); + return res; +} + +/** + * fsverity_file_open() - prepare to open a verity file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * When opening a verity file, deny the open if it is for writing. Otherwise, + * set up the inode's ->i_verity_info if not already done. + * + * When combined with fscrypt, this must be called after fscrypt_file_open(). + * Otherwise, we won't have the key set up to decrypt the verity metadata. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_file_open(struct inode *inode, struct file *filp) +{ + if (!IS_VERITY(inode)) + return 0; + + if (filp->f_mode & FMODE_WRITE) { + pr_debug("Denying opening verity file (ino %lu) for write\n", + inode->i_ino); + return -EPERM; + } + + return ensure_verity_info(inode); +} +EXPORT_SYMBOL_GPL(fsverity_file_open); + +/** + * fsverity_prepare_setattr() - prepare to change a verity inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Verity files are immutable, so deny truncates. This isn't covered by the + * open-time check because sys_truncate() takes a path, not a file descriptor. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) { + pr_debug("Denying truncate of verity file (ino %lu)\n", + d_inode(dentry)->i_ino); + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); + +/** + * fsverity_cleanup_inode() - free the inode's verity info, if present + * + * Filesystems must call this on inode eviction to free ->i_verity_info. + */ +void fsverity_cleanup_inode(struct inode *inode) +{ + fsverity_free_info(inode->i_verity_info); + inode->i_verity_info = NULL; +} +EXPORT_SYMBOL_GPL(fsverity_cleanup_inode); + +int __init fsverity_init_info_cache(void) +{ + fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, + SLAB_RECLAIM_ACCOUNT, + measurement); + if (!fsverity_info_cachep) + return -ENOMEM; + return 0; +} + +void __init fsverity_exit_info_cache(void) +{ + kmem_cache_destroy(fsverity_info_cachep); + fsverity_info_cachep = NULL; +} diff --git a/fs/verity/signature.c b/fs/verity/signature.c new file mode 100644 index 000000000000..c8b255232de5 --- /dev/null +++ b/fs/verity/signature.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/signature.c: verification of builtin signatures + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/cred.h> +#include <linux/key.h> +#include <linux/slab.h> +#include <linux/verification.h> + +/* + * /proc/sys/fs/verity/require_signatures + * If 1, all verity files must have a valid builtin signature. + */ +static int fsverity_require_signatures; + +/* + * Keyring that contains the trusted X.509 certificates. + * + * Only root (kuid=0) can modify this. Also, root may use + * keyctl_restrict_keyring() to prevent any more additions. + */ +static struct key *fsverity_keyring; + +/** + * fsverity_verify_signature() - check a verity file's signature + * + * If the file's fs-verity descriptor includes a signature of the file + * measurement, verify it against the certificates in the fs-verity keyring. + * + * Return: 0 on success (signature valid or not required); -errno on failure + */ +int fsverity_verify_signature(const struct fsverity_info *vi, + const struct fsverity_descriptor *desc, + size_t desc_size) +{ + const struct inode *inode = vi->inode; + const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; + const u32 sig_size = le32_to_cpu(desc->sig_size); + struct fsverity_signed_digest *d; + int err; + + if (sig_size == 0) { + if (fsverity_require_signatures) { + fsverity_err(inode, + "require_signatures=1, rejecting unsigned file!"); + return -EPERM; + } + return 0; + } + + if (sig_size > desc_size - sizeof(*desc)) { + fsverity_err(inode, "Signature overflows verity descriptor"); + return -EBADMSG; + } + + d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); + if (!d) + return -ENOMEM; + memcpy(d->magic, "FSVerity", 8); + d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs); + d->digest_size = cpu_to_le16(hash_alg->digest_size); + memcpy(d->digest, vi->measurement, hash_alg->digest_size); + + err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, + desc->signature, sig_size, + fsverity_keyring, + VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL); + kfree(d); + + if (err) { + if (err == -ENOKEY) + fsverity_err(inode, + "File's signing cert isn't in the fs-verity keyring"); + else if (err == -EKEYREJECTED) + fsverity_err(inode, "Incorrect file signature"); + else if (err == -EBADMSG) + fsverity_err(inode, "Malformed file signature"); + else + fsverity_err(inode, "Error %d verifying file signature", + err); + return err; + } + + pr_debug("Valid signature for file measurement %s:%*phN\n", + hash_alg->name, hash_alg->digest_size, vi->measurement); + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fsverity_sysctl_header; + +static const struct ctl_path fsverity_sysctl_path[] = { + { .procname = "fs", }, + { .procname = "verity", }, + { } +}; + +static struct ctl_table fsverity_sysctl_table[] = { + { + .procname = "require_signatures", + .data = &fsverity_require_signatures, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init fsverity_sysctl_init(void) +{ + fsverity_sysctl_header = register_sysctl_paths(fsverity_sysctl_path, + fsverity_sysctl_table); + if (!fsverity_sysctl_header) { + pr_err("sysctl registration failed!\n"); + return -ENOMEM; + } + return 0; +} +#else /* !CONFIG_SYSCTL */ +static inline int __init fsverity_sysctl_init(void) +{ + return 0; +} +#endif /* !CONFIG_SYSCTL */ + +int __init fsverity_init_signature(void) +{ + struct key *ring; + int err; + + ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), KEY_POS_SEARCH | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | + KEY_USR_SEARCH | KEY_USR_SETATTR, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + err = fsverity_sysctl_init(); + if (err) + goto err_put_ring; + + fsverity_keyring = ring; + return 0; + +err_put_ring: + key_put(ring); + return err; +} diff --git a/fs/verity/verify.c b/fs/verity/verify.c new file mode 100644 index 000000000000..3e8f2de44667 --- /dev/null +++ b/fs/verity/verify.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages() + * + * Copyright 2019 Google LLC + */ + +#include "fsverity_private.h" + +#include <crypto/hash.h> +#include <linux/bio.h> +#include <linux/ratelimit.h> + +static struct workqueue_struct *fsverity_read_workqueue; + +/** + * hash_at_level() - compute the location of the block's hash at the given level + * + * @params: (in) the Merkle tree parameters + * @dindex: (in) the index of the data block being verified + * @level: (in) the level of hash we want (0 is leaf level) + * @hindex: (out) the index of the hash block containing the wanted hash + * @hoffset: (out) the byte offset to the wanted hash within the hash block + */ +static void hash_at_level(const struct merkle_tree_params *params, + pgoff_t dindex, unsigned int level, pgoff_t *hindex, + unsigned int *hoffset) +{ + pgoff_t position; + + /* Offset of the hash within the level's region, in hashes */ + position = dindex >> (level * params->log_arity); + + /* Index of the hash block in the tree overall */ + *hindex = params->level_start[level] + (position >> params->log_arity); + + /* Offset of the wanted hash (in bytes) within the hash block */ + *hoffset = (position & ((1 << params->log_arity) - 1)) << + (params->log_blocksize - params->log_arity); +} + +/* Extract a hash from a hash page */ +static void extract_hash(struct page *hpage, unsigned int hoffset, + unsigned int hsize, u8 *out) +{ + void *virt = kmap_atomic(hpage); + + memcpy(out, virt + hoffset, hsize); + kunmap_atomic(virt); +} + +static inline int cmp_hashes(const struct fsverity_info *vi, + const u8 *want_hash, const u8 *real_hash, + pgoff_t index, int level) +{ + const unsigned int hsize = vi->tree_params.digest_size; + + if (memcmp(want_hash, real_hash, hsize) == 0) + return 0; + + fsverity_err(vi->inode, + "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + index, level, + vi->tree_params.hash_alg->name, hsize, want_hash, + vi->tree_params.hash_alg->name, hsize, real_hash); + return -EBADMSG; +} + +/* + * Verify a single data page against the file's Merkle tree. + * + * In principle, we need to verify the entire path to the root node. However, + * for efficiency the filesystem may cache the hash pages. Therefore we need + * only ascend the tree until an already-verified page is seen, as indicated by + * the PageChecked bit being set; then verify the path to that page. + * + * This code currently only supports the case where the verity block size is + * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we + * wouldn't be able to use the PageChecked bit. + * + * Note that multiple processes may race to verify a hash page and mark it + * Checked, but it doesn't matter; the result will be the same either way. + * + * Return: true if the page is valid, else false. + */ +static bool verify_page(struct inode *inode, const struct fsverity_info *vi, + struct ahash_request *req, struct page *data_page) +{ + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int hsize = params->digest_size; + const pgoff_t index = data_page->index; + int level; + u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; + const u8 *want_hash; + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; + struct page *hpages[FS_VERITY_MAX_LEVELS]; + unsigned int hoffsets[FS_VERITY_MAX_LEVELS]; + int err; + + if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page))) + return false; + + pr_debug_ratelimited("Verifying data page %lu...\n", index); + + /* + * Starting at the leaf level, ascend the tree saving hash pages along + * the way until we find a verified hash page, indicated by PageChecked; + * or until we reach the root. + */ + for (level = 0; level < params->num_levels; level++) { + pgoff_t hindex; + unsigned int hoffset; + struct page *hpage; + + hash_at_level(params, index, level, &hindex, &hoffset); + + pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", + level, hindex, hoffset); + + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, + hindex); + if (IS_ERR(hpage)) { + err = PTR_ERR(hpage); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, hindex); + goto out; + } + + if (PageChecked(hpage)) { + extract_hash(hpage, hoffset, hsize, _want_hash); + want_hash = _want_hash; + put_page(hpage); + pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", + params->hash_alg->name, + hsize, want_hash); + goto descend; + } + pr_debug_ratelimited("Hash page not yet checked\n"); + hpages[level] = hpage; + hoffsets[level] = hoffset; + } + + want_hash = vi->root_hash; + pr_debug("Want root hash: %s:%*phN\n", + params->hash_alg->name, hsize, want_hash); +descend: + /* Descend the tree verifying hash pages */ + for (; level > 0; level--) { + struct page *hpage = hpages[level - 1]; + unsigned int hoffset = hoffsets[level - 1]; + + err = fsverity_hash_page(params, inode, req, hpage, real_hash); + if (err) + goto out; + err = cmp_hashes(vi, want_hash, real_hash, index, level - 1); + if (err) + goto out; + SetPageChecked(hpage); + extract_hash(hpage, hoffset, hsize, _want_hash); + want_hash = _want_hash; + put_page(hpage); + pr_debug("Verified hash page at level %d, now want %s:%*phN\n", + level - 1, params->hash_alg->name, hsize, want_hash); + } + + /* Finally, verify the data page */ + err = fsverity_hash_page(params, inode, req, data_page, real_hash); + if (err) + goto out; + err = cmp_hashes(vi, want_hash, real_hash, index, -1); +out: + for (; level > 0; level--) + put_page(hpages[level - 1]); + + return err == 0; +} + +/** + * fsverity_verify_page() - verify a data page + * + * Verify a page that has just been read from a verity file. The page must be a + * pagecache page that is still locked and not yet uptodate. + * + * Return: true if the page is valid, else false. + */ +bool fsverity_verify_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + const struct fsverity_info *vi = inode->i_verity_info; + struct ahash_request *req; + bool valid; + + req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); + if (unlikely(!req)) + return false; + + valid = verify_page(inode, vi, req, page); + + ahash_request_free(req); + + return valid; +} +EXPORT_SYMBOL_GPL(fsverity_verify_page); + +#ifdef CONFIG_BLOCK +/** + * fsverity_verify_bio() - verify a 'read' bio that has just completed + * + * Verify a set of pages that have just been read from a verity file. The pages + * must be pagecache pages that are still locked and not yet uptodate. Pages + * that fail verification are set to the Error state. Verification is skipped + * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * + * This is a helper function for use by the ->readpages() method of filesystems + * that issue bios to read data directly into the page cache. Filesystems that + * populate the page cache without issuing bios (e.g. non block-based + * filesystems) must instead call fsverity_verify_page() directly on each page. + * All filesystems must also call fsverity_verify_page() on holes. + */ +void fsverity_verify_bio(struct bio *bio) +{ + struct inode *inode = bio_first_page_all(bio)->mapping->host; + const struct fsverity_info *vi = inode->i_verity_info; + struct ahash_request *req; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); + if (unlikely(!req)) { + bio_for_each_segment_all(bv, bio, iter_all) + SetPageError(bv->bv_page); + return; + } + + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!PageError(page) && !verify_page(inode, vi, req, page)) + SetPageError(page); + } + + ahash_request_free(req); +} +EXPORT_SYMBOL_GPL(fsverity_verify_bio); +#endif /* CONFIG_BLOCK */ + +/** + * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * + * Enqueue verification work for asynchronous processing. + */ +void fsverity_enqueue_verify_work(struct work_struct *work) +{ + queue_work(fsverity_read_workqueue, work); +} +EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work); + +int __init fsverity_init_workqueue(void) +{ + /* + * Use an unbound workqueue to allow bios to be verified in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since hashing is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize verification work, + * which blocks reads from completing, over regular application tasks. + */ + fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!fsverity_read_workqueue) + return -ENOMEM; + return 0; +} + +void __init fsverity_exit_workqueue(void) +{ + destroy_workqueue(fsverity_read_workqueue); + fsverity_read_workqueue = NULL; +} diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 16bb9a328678..da031b93e182 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -3,10 +3,10 @@ * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ -#include <linux/sched/mm.h> +#include "xfs.h" #include <linux/backing-dev.h> -#include "kmem.h" #include "xfs_message.h" +#include "xfs_trace.h" void * kmem_alloc(size_t size, xfs_km_flags_t flags) @@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_alloc(size, flags, _RET_IP_); + do { ptr = kmalloc(size, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, @@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } while (1); } -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) + +/* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence + * we need to tell memory reclaim that we are in such a context via + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here + * and potentially deadlocking. + */ +static void * +__kmem_vmalloc(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; - gfp_t lflags; - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; + gfp_t lflags = kmem_flags_convert(flags); - /* - * __vmalloc() will allocate data pages and auxillary structures (e.g. - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context - * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering - * the filesystem here and potentially deadlocking. - */ if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags, PAGE_KERNEL); if (flags & KM_NOFS) @@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; } +/* + * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned + * to the @align_mask. We only guarantee alignment up to page size, we'll clamp + * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE + * aligned region. + */ +void * +kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_io(size, flags, _RET_IP_); + + if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) + align_mask = PAGE_SIZE - 1; + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) { + if (!((uintptr_t)ptr & align_mask)) + return ptr; + kfree(ptr); + } + return __kmem_vmalloc(size, flags); +} + +void * +kmem_alloc_large(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_large(size, flags, _RET_IP_); + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + return __kmem_vmalloc(size, flags); +} + void * kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { @@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_realloc(newsize, flags, _RET_IP_); + do { ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, @@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_); do { ptr = kmem_cache_alloc(zone, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 267655acd426..8170d95cf930 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -16,8 +16,6 @@ */ typedef unsigned __bitwise xfs_km_flags_t; -#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) -#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) @@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); + BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); - if (flags & KM_NOSLEEP) { - lflags = GFP_ATOMIC | __GFP_NOWARN; - } else { - lflags = GFP_KERNEL | __GFP_NOWARN; - if (flags & KM_NOFS) - lflags &= ~__GFP_FS; - } + lflags = GFP_KERNEL | __GFP_NOWARN; + if (flags & KM_NOFS) + lflags &= ~__GFP_FS; /* * Default page/slab allocator behavior is to retry for ever @@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 372ad55631fc..533b04aaf6f6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2205,7 +2205,7 @@ xfs_defer_agfl_block( ASSERT(xfs_bmap_free_item_zone != NULL); ASSERT(oinfo != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; new->xefi_oinfo = *oinfo; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d6ed5d2c07c2..58fa85cec325 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index d48fcf11cc35..510ca6974604 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -97,7 +97,10 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -/* Retrieve an extended attribute and its value. Must have ilock. */ +/* + * Retrieve an extended attribute and its value. Must have ilock. + * Returns 0 on successful retrieval, otherwise an error. + */ int xfs_attr_get_ilocked( struct xfs_inode *ip, @@ -115,12 +118,28 @@ xfs_attr_get_ilocked( return xfs_attr_node_get(args); } -/* Retrieve an extended attribute by name, and its value. */ +/* + * Retrieve an extended attribute by name, and its value if requested. + * + * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, + * just an indication whether the attribute exists and the size of the value if + * it exists. The size is returned in @valuelenp, + * + * If the attribute is found, but exceeds the size limit set by the caller in + * @valuelenp, return -ERANGE with the size of the attribute that was found in + * @valuelenp. + * + * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after + * existence of the attribute has been determined. On success, return that + * buffer to the caller and leave them to free it. On failure, free any + * allocated buffer and ensure the buffer pointer returned to the caller is + * null. + */ int xfs_attr_get( struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, + unsigned char **value, int *valuelenp, int flags) { @@ -128,6 +147,8 @@ xfs_attr_get( uint lock_mode; int error; + ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); + XFS_STATS_INC(ip->i_mount, xs_attr_get); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -137,17 +158,29 @@ xfs_attr_get( if (error) return error; - args.value = value; - args.valuelen = *valuelenp; /* Entirely possible to look up a name which doesn't exist */ args.op_flags = XFS_DA_OP_OKNOENT; + if (flags & ATTR_ALLOC) + args.op_flags |= XFS_DA_OP_ALLOCVAL; + else + args.value = *value; + args.valuelen = *valuelenp; lock_mode = xfs_ilock_attr_map_shared(ip); error = xfs_attr_get_ilocked(ip, &args); xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; - return error == -EEXIST ? 0 : error; + + /* on error, we have to clean up allocated value buffers */ + if (error) { + if (flags & ATTR_ALLOC) { + kmem_free(args.value); + *value = NULL; + } + return error; + } + *value = args.value; + return 0; } /* @@ -768,6 +801,8 @@ xfs_attr_leaf_removename( * * This leaf block cannot have a "remote" value, we only call this routine * if bmap_one_block() says there is only one block (ie: no remote blks). + * + * Returns 0 on successful retrieval, otherwise an error. */ STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) @@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args) } error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); - if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { - error = xfs_attr_rmtval_get(args); - } return error; } @@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state) } /* - * Look up a filename in a node attribute list. + * Retrieve the attribute data from a node attribute list. * * This routine gets called for any attribute fork that has more than one * block, ie: both true Btree attr lists and for single-leaf-blocks with * "remote" values taking up more blocks. + * + * Returns 0 on successful retrieval, otherwise an error. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args) @@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args) error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; - } else if (retval == -EEXIST) { - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - - /* - * Get the value, local or "remote" - */ - retval = xfs_attr3_leaf_getvalue(blk->bp, args); - if (!retval && (args->rmtblkno > 0) - && !(args->flags & ATTR_KERNOVAL)) { - retval = xfs_attr_rmtval_get(args); - } + goto out_release; } + if (retval != -EEXIST) + goto out_release; + + /* + * Get the value, local or "remote" + */ + blk = &state->path.blk[state->path.active - 1]; + retval = xfs_attr3_leaf_getvalue(blk->bp, args); /* * If not in a transaction, we have to release all the buffers. */ +out_release: for (i = 0; i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index ff28ebf3b635..94badfa1743e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -37,6 +37,7 @@ struct xfs_attr_list_context; #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ +#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ @@ -47,7 +48,8 @@ struct xfs_attr_list_context; { ATTR_REPLACE, "REPLACE" }, \ { ATTR_KERNOTIME, "KERNOTIME" }, \ { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" } + { ATTR_INCOMPLETE, "INCOMPLETE" }, \ + { ATTR_ALLOC, "ALLOC" } /* * The maximum size (into the kernel or returned from the kernel) of an @@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, int *valuelenp, int flags); + unsigned char **value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, unsigned char *value, int valuelen, int flags); int xfs_attr_set_args(struct xfs_da_args *args); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 70eb941d02e4..b9f019603d0b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); } +static int +xfs_attr_copy_value( + struct xfs_da_args *args, + unsigned char *value, + int valuelen) +{ + /* + * No copy if all we have to do is get the length + */ + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return 0; + } + + /* + * No copy if the length of the existing buffer is too small + */ + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return -ERANGE; + } + + if (args->op_flags & XFS_DA_OP_ALLOCVAL) { + args->value = kmem_alloc_large(valuelen, 0); + if (!args->value) + return -ENOMEM; + } + args->valuelen = valuelen; + + /* remote block xattr requires IO for copy-in */ + if (args->rmtblkno) + return xfs_attr_rmtval_get(args); + + /* + * This is to prevent a GCC warning because the remote xattr case + * doesn't have a value to pass in. In that case, we never reach here, + * but GCC can't work that out and so throws a "passing NULL to + * memcpy" warning. + */ + if (!value) + return -EINVAL; + memcpy(args->value, value, valuelen); + return 0; +} /*======================================================================== * External routines when attribute fork size < XFS_LITINO(mp). @@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) } /* - * Look up a name in a shortform attribute list structure. + * Retreive the attribute value and length. + * + * If ATTR_KERNOVAL is specified, only the length needs to be returned. + * Unlike a lookup, we only return an error if the attribute does not + * exist or we can't retrieve the value. */ -/*ARGSUSED*/ int -xfs_attr_shortform_getvalue(xfs_da_args_t *args) +xfs_attr_shortform_getvalue( + struct xfs_da_args *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int i; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; @@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = sfe->valuelen; - return -EEXIST; - } - if (args->valuelen < sfe->valuelen) { - args->valuelen = sfe->valuelen; - return -ERANGE; - } - args->valuelen = sfe->valuelen; - memcpy(args->value, &sfe->nameval[args->namelen], - args->valuelen); - return -EEXIST; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } return -ENOATTR; } @@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf( ifp = dp->i_afp; sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); - tmpbuffer = kmem_alloc(size, KM_SLEEP); + tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); sf = (xfs_attr_shortform_t *)tmpbuffer; @@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, 0); if (!tmpbuffer) return -ENOMEM; @@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, 0); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); + tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); /* * Copy the header into the temp leaf so that all the stuff @@ -2350,6 +2388,10 @@ xfs_attr3_leaf_lookup_int( /* * Get the value associated with an attribute name from a leaf attribute * list structure. + * + * If ATTR_KERNOVAL is specified, only the length needs to be returned. + * Unlike a lookup, we only return an error if the attribute does not + * exist or we can't retrieve the value. */ int xfs_attr3_leaf_getvalue( @@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue( struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_attr_leaf_name_remote *name_rmt; - int valuelen; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); @@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue( name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); - valuelen = be16_to_cpu(name_loc->valuelen); - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; - return 0; - } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; - return -ERANGE; - } - args->valuelen = valuelen; - memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); - } else { - name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); - ASSERT(name_rmt->namelen == args->namelen); - ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); - args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - args->rmtvaluelen); - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = args->rmtvaluelen; - return 0; - } - if (args->valuelen < args->rmtvaluelen) { - args->valuelen = args->rmtvaluelen; - return -ERANGE; - } - args->valuelen = args->rmtvaluelen; - } - return 0; + return xfs_attr_copy_value(args, + &name_loc->nameval[args->namelen], + be16_to_cpu(name_loc->valuelen)); + } + + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); + return xfs_attr_copy_value(args, NULL, args->rmtvaluelen); } /*======================================================================== diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4eb30d357045..3e39b7d40f25 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin( /* * Read the value associated with an attribute from the out-of-line buffer * that we stored it in. + * + * Returns 0 on successful retrieval, otherwise an error. */ int xfs_attr_rmtval_get( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 07aad70f3931..054b4ce30033 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -553,7 +553,7 @@ __xfs_bmap_add_free( #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); new->xefi_startblock = bno; new->xefi_blockcount = (xfs_extlen_t)len; if (oinfo) @@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork( if (error) goto trans_cancel; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; switch (ip->i_d.di_format) { @@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real( } /* add reverse mapping unless caller opted out */ - if (!(bma->flags & XFS_BMAPI_NORMAP)) { - error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); - if (error) - goto done; - } + if (!(bma->flags & XFS_BMAPI_NORMAP)) + xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); - if (error) - goto done; + xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real( } /* add reverse mapping unless caller opted out */ - if (!(flags & XFS_BMAPI_NORMAP)) { - error = xfs_rmap_map_extent(tp, ip, whichfork, new); - if (error) - goto done; - } + if (!(flags & XFS_BMAPI_NORMAP)) + xfs_rmap_map_extent(tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -4050,12 +4042,8 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - else - bma->datatype |= XFS_ALLOC_USERDATA; - } + if (whichfork == XFS_DATA_FORK && bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } @@ -4401,12 +4389,9 @@ xfs_bmapi_write( * If this is a CoW allocation, record the data in * the refcount btree for orphan recovery. */ - if (whichfork == XFS_COW_FORK) { - error = xfs_refcount_alloc_cow_extent(tp, - bma.blkno, bma.length); - if (error) - goto error0; - } + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); } /* Deal with the allocated space we found. */ @@ -4530,7 +4515,7 @@ xfs_bmapi_convert_delalloc( if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) goto out_finish; error = -EFSCORRUPTED; - if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) goto out_finish; XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); @@ -4540,12 +4525,8 @@ xfs_bmapi_convert_delalloc( *imap = bma.got; *seq = READ_ONCE(ifp->if_seq); - if (whichfork == XFS_COW_FORK) { - error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); - if (error) - goto out_finish; - } + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -5149,18 +5130,14 @@ xfs_bmap_del_extent_real( } /* remove reverse mapping */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); - if (error) - goto done; + xfs_rmap_unmap_extent(tp, ip, whichfork, del); /* * If we need to, add to list of extents to delete. */ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - error = xfs_refcount_decrease_extent(tp, del); - if (error) - goto done; + xfs_refcount_decrease_extent(tp, del); } else { __xfs_bmap_add_free(tp, del->br_startblock, del->br_blockcount, NULL, @@ -5651,12 +5628,11 @@ done: &new); /* update reverse mapping. rmap functions merge the rmaps for us */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); - if (error) - return error; + xfs_rmap_unmap_extent(tp, ip, whichfork, got); memcpy(&new, got, sizeof(new)); new.br_startoff = left->br_startoff + left->br_blockcount; - return xfs_rmap_map_extent(tp, ip, whichfork, &new); + xfs_rmap_map_extent(tp, ip, whichfork, &new); + return 0; } static int @@ -5695,10 +5671,9 @@ xfs_bmap_shift_update_extent( got); /* update reverse mapping */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); - if (error) - return error; - return xfs_rmap_map_extent(tp, ip, whichfork, got); + xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); + xfs_rmap_map_extent(tp, ip, whichfork, got); + return 0; } int @@ -6094,7 +6069,7 @@ __xfs_bmap_add( bmap->br_blockcount, bmap->br_state); - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; @@ -6106,29 +6081,29 @@ __xfs_bmap_add( } /* Map an extent into a file. */ -int +void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *PREV) { if (!xfs_bmap_is_update_needed(PREV)) - return 0; + return; - return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); } /* Unmap an extent out of a file. */ -int +void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *PREV) { if (!xfs_bmap_is_update_needed(PREV)) - return 0; + return; - return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); } /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8f597f9abdbe..5bb446d80542 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) !isnullstartblock(irec->br_startblock); } +/* + * Check the mapping for obviously garbage allocations that could trash the + * filesystem immediately. + */ +#define xfs_valid_startblock(ip, startblock) \ + ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) + void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); @@ -254,9 +261,9 @@ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t *blockcount, xfs_exntst_t state); -int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); static inline int xfs_bmap_fork_to_state(int whichfork) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fbb18ba5d905..ffe608d2a2d9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys( union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - - be64_to_cpu(k2->bmbt.br_startoff); + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + + /* + * Note: This routine previously casted a and b to int64 and subtracted + * them to generate a result. This lead to problems if b was the + * "maximum" key value (all ones) being signed incorrectly, hence this + * somewhat less efficient version. + */ + if (a > b) + return 1; + if (b > a) + return -1; + return 0; } static xfs_failaddr_t diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f1048efa4268..71de937f9e64 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify( * btree block * * @bp: buffer containing the btree block - * @max_recs: pointer to the m_*_mxr max records field in the xfs mount - * @pag_max_level: pointer to the per-ag max level field */ xfs_failaddr_t xfs_btree_sblock_v5hdr_verify( @@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range( /* Callback */ error = fn(cur, recp, priv); - if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error) break; advloop: @@ -4702,8 +4700,7 @@ pop_up: */ if (ldiff >= 0 && hdiff >= 0) { error = fn(cur, recp, priv); - if (error < 0 || - error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error) break; } else if (hdiff < 0) { /* Record is larger than high key; pop. */ @@ -4774,8 +4771,7 @@ out: * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error - * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a - * negative error code. + * code. This function returns -ECANCELED, zero, or a negative error code. */ int xfs_btree_query_range( @@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper( union xfs_btree_rec *rec, void *priv) { - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* Is there a record covering a given range of keys? */ @@ -4906,7 +4902,7 @@ xfs_btree_has_record( error = xfs_btree_query_range(cur, low, high, &xfs_btree_has_record_helper, NULL); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == -ECANCELED) { *exists = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index fa3cd8ab9aba..ced1e65d1483 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); -/* return codes */ -#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ -#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ +/* + * Return codes for the query range iterator function are 0 to continue + * iterating, and non-zero to stop iterating. Any non-zero value will be + * passed up to the _query_range caller. The special value -ECANCELED can be + * used to stop iteration, because _query_range never generates that error + * code on its own. + */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 0bf56e94bfe9..4fd1223c1bd5 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2098,7 +2098,7 @@ xfs_da_grow_inode_int( * If we didn't get it and the block might work if fragmented, * try without the CONTIG flag. Loop until we get it all. */ - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); @@ -2480,7 +2480,7 @@ xfs_buf_map_from_irec( if (nirecs > 1) { map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), - KM_SLEEP | KM_NOFS); + KM_NOFS); if (!map) return -ENOMEM; *mapp = map; @@ -2539,7 +2539,7 @@ xfs_dabuf_map( */ if (nfsb != 1) irecs = kmem_zalloc(sizeof(irec) * nfsb, - KM_SLEEP | KM_NOFS); + KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 84dd865b6c3d..ae0bbd20d9ca 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -81,13 +81,15 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ +#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ { XFS_DA_OP_RENAME, "RENAME" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ - { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index eb2be2a6a25a..22557527cfdb 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -517,7 +517,7 @@ xfs_defer_add( } if (!dfp) { dfp = kmem_alloc(sizeof(struct xfs_defer_pending), - KM_SLEEP | KM_NOFS); + KM_NOFS); dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 67840723edbb..867c5dee0751 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -110,9 +110,9 @@ xfs_da_mount( nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kmem_free(mp->m_dir_geo); kmem_free(mp->m_attr_geo); @@ -217,7 +217,7 @@ xfs_dir_init( if (error) return error; - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -254,7 +254,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -353,7 +353,7 @@ xfs_dir_lookup( * lockdep Doing this avoids having to add a bunch of lockdep class * annotations into the reclaim path for the ilock. */ - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; @@ -422,7 +422,7 @@ xfs_dir_removename( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -483,7 +483,7 @@ xfs_dir_replace( if (rval) return rval; - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index a6fb0cc2085e..9595ced393dc 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block( * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); + sfp = kmem_alloc(ifp->if_bytes, 0); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 1fc44efc344d..705c4f562758 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); -static int xfs_dir2_node_addname_int(xfs_da_args_t *args, - xfs_da_state_blk_t *fblk); /* * Check internal consistency of a leafn block. @@ -1611,113 +1609,152 @@ xfs_dir2_leafn_unbalance( } /* - * Top-level node form directory addname routine. + * Add a new data block to the directory at the free space index that the caller + * has specified. */ -int /* error */ -xfs_dir2_node_addname( - xfs_da_args_t *args) /* operation arguments */ +static int +xfs_dir2_node_add_datablk( + struct xfs_da_args *args, + struct xfs_da_state_blk *fblk, + xfs_dir2_db_t *dbno, + struct xfs_buf **dbpp, + struct xfs_buf **fbpp, + int *findex) { - xfs_da_state_blk_t *blk; /* leaf block for insert */ - int error; /* error return value */ - int rval; /* sub-return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; + struct xfs_dir2_data_hdr *hdr; + struct xfs_dir2_free *free = NULL; + xfs_dir2_db_t fbno; + struct xfs_buf *fbp; + struct xfs_buf *dbp; + __be16 *bests = NULL; + int error; - trace_xfs_dir2_node_addname(args); + /* Not allowed to allocate, return failure. */ + if (args->total == 0) + return -ENOSPC; + + /* Allocate and initialize the new data block. */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno); + if (error) + return error; + error = xfs_dir3_data_init(args, *dbno, &dbp); + if (error) + return error; /* - * Allocate and initialize the state (btree cursor). - */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; - /* - * Look up the name. We're not supposed to find it, but - * this gives us the insertion point. + * Get the freespace block corresponding to the data block + * that was just allocated. */ - error = xfs_da3_node_lookup_int(state, &rval); + fbno = dp->d_ops->db_to_fdb(args->geo, *dbno); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) - rval = error; - if (rval != -ENOENT) { - goto done; - } + return error; + /* - * Add the data entry to a data block. - * Extravalid is set to a freeblock found by lookup. + * If there wasn't a freespace block, the read will + * return a NULL fbp. Allocate and initialize a new one. */ - rval = xfs_dir2_node_addname_int(args, - state->extravalid ? &state->extrablk : NULL); - if (rval) { - goto done; + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno); + if (error) + return error; + + if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) { + xfs_alert(mp, +"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld", + __func__, (unsigned long long)dp->i_ino, + (long long)dp->d_ops->db_to_fdb(args->geo, *dbno), + (long long)*dbno, (long long)fbno); + if (fblk) { + xfs_alert(mp, + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", + fblk, (unsigned long long)fblk->blkno, + fblk->index, fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* Get a buffer for the new block. */ + error = xfs_dir3_free_get_buf(args, fbno, &fbp); + if (error) + return error; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* Remember the first slot as our empty slot. */ + freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); + } else { + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); } - blk = &state->path.blk[state->path.active - 1]; - ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* Set the freespace block index from the data block number. */ + *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno); + + /* Extend the freespace table if the new data block is off the end. */ + if (*findex >= freehdr.nvalid) { + ASSERT(*findex < dp->d_ops->free_max_bests(args->geo)); + freehdr.nvalid = *findex + 1; + bests[*findex] = cpu_to_be16(NULLDATAOFF); + } + /* - * Add the new leaf entry. + * If this entry was for an empty data block (this should always be + * true) then update the header. */ - rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); - if (rval == 0) { - /* - * It worked, fix the hash values up the btree. - */ - if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da3_fixhashpath(state, &state->path); - } else { - /* - * It didn't work, we need to split the leaf block. - */ - if (args->total == 0) { - ASSERT(rval == -ENOSPC); - goto done; - } - /* - * Split the leaf block and insert the new entry. - */ - rval = xfs_da3_split(state); + if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); } -done: - xfs_da_state_free(state); - return rval; + + /* Update the freespace value for the new block in the table. */ + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bests[*findex] = bf[0].length; + + *dbpp = dbp; + *fbpp = fbp; + return 0; } -/* - * Add the data entry for a node-format directory name addition. - * The leaf entry is added in xfs_dir2_leafn_add. - * We may enter with a freespace block that the lookup found. - */ -static int /* error */ -xfs_dir2_node_addname_int( - xfs_da_args_t *args, /* operation arguments */ - xfs_da_state_blk_t *fblk) /* optional freespace block */ +static int +xfs_dir2_node_find_freeblk( + struct xfs_da_args *args, + struct xfs_da_state_blk *fblk, + xfs_dir2_db_t *dbnop, + struct xfs_buf **fbpp, + int *findexp, + int length) { - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_db_t dbno; /* data block number */ - struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ - int error; /* error return value */ - xfs_dir2_db_t fbno; /* freespace block number */ - struct xfs_buf *fbp; /* freespace buffer */ - int findex; /* freespace entry index */ - xfs_dir2_free_t *free=NULL; /* freespace block structure */ - xfs_dir2_db_t ifbno; /* initial freespace block no */ - xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ - int length; /* length of the new entry */ - int logfree; /* need to log free entry */ - xfs_mount_t *mp; /* filesystem mount point */ - int needlog; /* need to log data header */ - int needscan; /* need to rescan data frees */ - __be16 *tagp; /* data entry tag pointer */ - xfs_trans_t *tp; /* transaction pointer */ - __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; - struct xfs_dir2_data_free *bf; - xfs_dir2_data_aoff_t aoff; + struct xfs_dir2_free *free = NULL; + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_buf *fbp = NULL; + xfs_dir2_db_t firstfbno; + xfs_dir2_db_t lastfbno; + xfs_dir2_db_t ifbno = -1; + xfs_dir2_db_t dbno = -1; + xfs_dir2_db_t fbno; + xfs_fileoff_t fo; + __be16 *bests = NULL; + int findex = 0; + int error; - dp = args->dp; - mp = dp->i_mount; - tp = args->trans; - length = dp->d_ops->data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1725,288 +1762,157 @@ xfs_dir2_node_addname_int( */ if (fblk) { fbp = fblk->bp; - /* - * Remember initial freespace block number. - */ - ifbno = fblk->blkno; free = fbp->b_addr; findex = fblk->index; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - /* - * This means the free entry showed that the data block had - * space for our entry, so we remembered it. - * Use that data block. - */ if (findex >= 0) { + /* caller already found the freespace for us. */ + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(findex < freehdr.nvalid); ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); ASSERT(be16_to_cpu(bests[findex]) >= length); dbno = freehdr.firstdb + findex; - } else { - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - dbno = -1; - findex = 0; + goto found_block; } - } else { + /* - * Didn't come in with a freespace block, so no data block. + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. */ - ifbno = dbno = -1; + ifbno = fblk->blkno; + xfs_trans_brelse(tp, fbp); fbp = NULL; - findex = 0; + fblk->bp = NULL; } /* - * If we don't have a data block yet, we're going to scan the - * freespace blocks looking for one. Figure out what the - * highest freespace block number is. - */ - if (dbno == -1) { - xfs_fileoff_t fo; /* freespace block number */ - - if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) - return error; - lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); - fbno = ifbno; - } - /* - * While we haven't identified a data block, search the freeblock - * data for a good data block. If we find a null freeblock entry, - * indicating a hole in the data blocks, remember that. + * If we don't have a data block yet, we're going to scan the freespace + * data for a data block with enough free space in it. */ - while (dbno == -1) { - /* - * If we don't have a freeblock in hand, get the next one. - */ - if (fbp == NULL) { - /* - * Happens the first time through unless lookup gave - * us a freespace block to start with. - */ - if (++fbno == 0) - fbno = xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET); - /* - * If it's ifbno we already looked at it. - */ - if (fbno == ifbno) - fbno++; - /* - * If it's off the end we're done. - */ - if (fbno >= lastfbno) - break; - /* - * Read the block. There can be holes in the - * freespace blocks, so this might not succeed. - * This should be really rare, so there's no reason - * to avoid it. - */ - error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fbno), - &fbp); - if (error) - return error; - if (!fbp) - continue; - free = fbp->b_addr; - findex = 0; - } - /* - * Look at the current free entry. Is it good enough? - * - * The bests initialisation should be where the bufer is read in - * the above branch. But gcc is too stupid to realise that bests - * and the freehdr are actually initialised if they are placed - * there, so we have to do it here to avoid warnings. Blech. - */ - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - if (be16_to_cpu(bests[findex]) != NULLDATAOFF && - be16_to_cpu(bests[findex]) >= length) - dbno = freehdr.firstdb + findex; - else { - /* - * Are we done with the freeblock? - */ - if (++findex == freehdr.nvalid) { - /* - * Drop the block. - */ - xfs_trans_brelse(tp, fbp); - fbp = NULL; - if (fblk && fblk->bp) - fblk->bp = NULL; - } - } - } - /* - * If we don't have a data block, we need to allocate one and make - * the freespace entries refer to it. - */ - if (unlikely(dbno == -1)) { - /* - * Not allowed to allocate, return failure. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) - return -ENOSPC; - - /* - * Allocate and initialize the new data block. - */ - if (unlikely((error = xfs_dir2_grow_inode(args, - XFS_DIR2_DATA_SPACE, - &dbno)) || - (error = xfs_dir3_data_init(args, dbno, &dbp)))) - return error; + error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK); + if (error) + return error; + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); + firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET); - /* - * If (somehow) we have a freespace block, get rid of it. - */ - if (fbp) - xfs_trans_brelse(tp, fbp); - if (fblk && fblk->bp) - fblk->bp = NULL; + for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) { + /* If it's ifbno we already looked at it. */ + if (fbno == ifbno) + continue; /* - * Get the freespace block corresponding to the data block - * that was just allocated. + * Read the block. There can be holes in the freespace blocks, + * so this might not succeed. This should be really rare, so + * there's no reason to avoid it. */ - fbno = dp->d_ops->db_to_fdb(args->geo, dbno); error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; + if (!fbp) + continue; - /* - * If there wasn't a freespace block, the read will - * return a NULL fbp. Allocate and initialize a new one. - */ - if (!fbp) { - error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno); - if (error) - return error; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); - if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { - xfs_alert(mp, -"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", - __func__, (unsigned long long)dp->i_ino, - (long long)dp->d_ops->db_to_fdb( - args->geo, dbno), - (long long)dbno, (long long)fbno, - (unsigned long long)ifbno, lastfbno); - if (fblk) { - xfs_alert(mp, - " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", - fblk, - (unsigned long long)fblk->blkno, - fblk->index, - fblk->magic); - } else { - xfs_alert(mp, " ... fblk is NULL"); - } - XFS_ERROR_REPORT("xfs_dir2_node_addname_int", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; + /* Scan the free entry array for a large enough free space. */ + for (findex = freehdr.nvalid - 1; findex >= 0; findex--) { + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) { + dbno = freehdr.firstdb + findex; + goto found_block; } - - /* - * Get a buffer for the new block. - */ - error = xfs_dir3_free_get_buf(args, fbno, &fbp); - if (error) - return error; - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - /* - * Remember the first slot as our empty slot. - */ - freehdr.firstdb = - (fbno - xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET)) * - dp->d_ops->free_max_bests(args->geo); - } else { - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); } - /* - * Set the freespace block index from the data block number. - */ - findex = dp->d_ops->db_to_fdindex(args->geo, dbno); - /* - * If it's after the end of the current entries in the - * freespace block, extend that table. - */ - if (findex >= freehdr.nvalid) { - ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); - freehdr.nvalid = findex + 1; - /* - * Tag new entry so nused will go up. - */ - bests[findex] = cpu_to_be16(NULLDATAOFF); - } - /* - * If this entry was for an empty data block - * (this should always be true) then update the header. - */ - if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { - freehdr.nused++; - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_header(args, fbp); - } - /* - * Update the real value in the table. - * We haven't allocated the data entry yet so this will - * change again. - */ - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - bests[findex] = bf[0].length; - logfree = 1; + /* Didn't find free space, go on to next free block */ + xfs_trans_brelse(tp, fbp); } + +found_block: + *dbnop = dbno; + *fbpp = fbp; + *findexp = findex; + return 0; +} + + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int +xfs_dir2_node_addname_int( + struct xfs_da_args *args, /* operation arguments */ + struct xfs_da_state_blk *fblk) /* optional freespace block */ +{ + struct xfs_dir2_data_unused *dup; /* data unused entry pointer */ + struct xfs_dir2_data_entry *dep; /* data entry pointer */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_free *free = NULL; /* freespace block structure */ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_buf *dbp; /* data block buffer */ + struct xfs_buf *fbp; /* freespace buffer */ + xfs_dir2_data_aoff_t aoff; + xfs_dir2_db_t dbno; /* data block number */ + int error; /* error return value */ + int findex; /* freespace entry index */ + int length; /* length of the new entry */ + int logfree = 0; /* need to log free entry */ + int needlog = 0; /* need to log data header */ + int needscan = 0; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + __be16 *bests; + + length = dp->d_ops->data_entsize(args->namelen); + error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex, + length); + if (error) + return error; + /* - * We had a data block so we don't have to make a new one. + * Now we know if we must allocate blocks, so if we are checking whether + * we can insert without allocation then we can return now. */ - else { - /* - * If just checking, we succeeded. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + if (dbno == -1) + return -ENOSPC; + return 0; + } - /* - * Read the data block in. - */ + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (dbno == -1) { + /* we're going to have to log the free block index later */ + logfree = 1; + error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp, + &findex); + } else { + /* Read the data block in. */ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, dbno), -1, &dbp); - if (error) - return error; - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - logfree = 0; } + if (error) + return error; + + /* setup for data block up now */ + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); ASSERT(be16_to_cpu(bf[0].length) >= length); - /* - * Point to the existing unused space. - */ + + /* Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); - needscan = needlog = 0; - /* - * Mark the first part of the unused space, inuse for us. - */ + + /* Mark the first part of the unused space, inuse for us. */ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, &needlog, &needscan); @@ -2014,9 +1920,8 @@ xfs_dir2_node_addname_int( xfs_trans_brelse(tp, dbp); return error; } - /* - * Fill in the new entry and log it. - */ + + /* Fill in the new entry and log it. */ dep = (xfs_dir2_data_entry_t *)dup; dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; @@ -2025,38 +1930,101 @@ xfs_dir2_node_addname_int( tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(args, dbp, dep); - /* - * Rescan the block for bestfree if needed. - */ + + /* Rescan the freespace and log the data block if needed. */ if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); - /* - * Log the data block header if needed. - */ if (needlog) xfs_dir2_data_log_header(args, dbp); - /* - * If the freespace entry is now wrong, update it. - */ - bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ - if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + + /* If the freespace block entry is now wrong, update it. */ + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + if (bests[findex] != bf[0].length) { bests[findex] = bf[0].length; logfree = 1; } - /* - * Log the freespace entry if needed. - */ + + /* Log the freespace entry if needed. */ if (logfree) xfs_dir2_free_log_bests(args, fbp, findex, findex); - /* - * Return the data block and offset in args, then drop the data block. - */ + + /* Return the data block and offset in args. */ args->blkno = (xfs_dablk_t)dbno; args->index = be16_to_cpu(*tagp); return 0; } /* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + + /* + * Allocate and initialize the state (btree cursor). + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != -ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. + */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da3_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == -ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da3_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* * Lookup an entry in a node-format directory. * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 033589257f54..85f14fc2a8da 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -164,7 +164,7 @@ xfs_dir2_block_to_sf( * can free the block and copy the formatted data into the inode literal * area. */ - dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + dst = kmem_alloc(mp->m_sb.sb_inodesize, 0); hdr = bp->b_addr; /* @@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; - buf = kmem_alloc(old_isize, KM_SLEEP); + buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); /* @@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, KM_SLEEP); + buf = kmem_alloc(oldsize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); @@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, KM_SLEEP); + buf = kmem_alloc(oldsize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 52d03a3a02a4..39dd2b908106 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -287,7 +287,7 @@ struct xfs_ag_geometry { uint32_t ag_ifree; /* o: inodes free */ uint32_t ag_sick; /* o: sick things in ag */ uint32_t ag_checked; /* o: checked metadata in ag */ - uint32_t ag_reserved32; /* o: zero */ + uint32_t ag_flags; /* i/o: flags for this ag */ uint64_t ag_reserved[12];/* o: zero */ }; #define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 04377ab75863..588d44613094 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry( igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); - /* Set the maximum inode count for this filesystem. */ - if (sbp->sb_imax_pct) { + /* + * Set the maximum inode count for this filesystem, being careful not + * to use obviously garbage sb_inopblog/sb_inopblock values. Regular + * users should never get here due to failing sb verification, but + * certain users (xfs_db) need to be usable even with corrupt metadata. + */ + if (sbp->sb_imax_pct && igeo->ialloc_blks) { /* * Make sure the maximum inode count is a multiple * of the units we allocate inodes in. diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 27aa3f2bc4bc..7bc87408f1a0 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -616,7 +616,7 @@ xfs_iext_realloc_root( * sequence counter is seen before the modifications to the extent tree itself * take effect. */ -static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) +static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) { WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } @@ -633,7 +633,7 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); @@ -875,7 +875,7 @@ xfs_iext_remove( ASSERT(ifp->if_u1.if_root != NULL); ASSERT(xfs_iext_valid(ifp, cur)); - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; for (i = cur->pos; i < nr_entries; i++) @@ -983,7 +983,7 @@ xfs_iext_update_extent( { struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); if (cur->pos == 0) { struct xfs_bmbt_irec old; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bf3e04018246..c643beeb5a24 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -94,7 +94,7 @@ xfs_iformat_fork( return 0; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: @@ -147,7 +147,7 @@ xfs_init_local_fork( if (size) { real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -302,7 +302,7 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ifp->if_broot = kmem_alloc(size, KM_NOFS); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -367,7 +367,7 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot = kmem_alloc(new_size, KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; } @@ -382,7 +382,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_SLEEP | KM_NOFS); + KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -408,7 +408,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + new_broot = kmem_alloc(new_size, KM_NOFS); /* * First copy over the btree block header. */ @@ -492,7 +492,7 @@ xfs_idata_realloc( * We enforce that here. */ ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_SLEEP | KM_NOFS); + roundup(new_size, 4), KM_NOFS); ifp->if_bytes = new_size; } @@ -683,7 +683,7 @@ xfs_ifork_init_cow( return; ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, - KM_SLEEP | KM_NOFS); + KM_NOFS); ip->i_cowfp->if_flags = XFS_IFEXTENTS; ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 51bb9bdb0e84..9a7fadb1361c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1174,7 +1174,7 @@ out_cur: /* * Record a refcount intent for later processing. */ -static int +static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, @@ -1189,44 +1189,43 @@ __xfs_refcount_add( blockcount); ri = kmem_alloc(sizeof(struct xfs_refcount_intent), - KM_SLEEP | KM_NOFS); + KM_NOFS); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); - return 0; } /* * Increase the reference count of the blocks backing a file's extent. */ -int +void xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) - return 0; + return; - return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, - PREV->br_startblock, PREV->br_blockcount); + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + PREV->br_blockcount); } /* * Decrease the reference count of the blocks backing a file's extent. */ -int +void xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) - return 0; + return; - return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, - PREV->br_startblock, PREV->br_blockcount); + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + PREV->br_blockcount); } /* @@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free( } /* Record a CoW staging extent in the refcount btree. */ -int +void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { struct xfs_mount *mp = tp->t_mountp; - int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return 0; + return; - error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); - if (error) - return error; + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); /* Add rmap entry */ - return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), + xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ -int +void xfs_refcount_free_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { struct xfs_mount *mp = tp->t_mountp; - int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return 0; + return; /* Remove rmap entry */ - error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), + xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); - if (error) - return error; - - return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); } struct xfs_refcount_recovery { @@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent( if (be32_to_cpu(rec->refc.rc_refcount) != 1) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); list_add_tail(&rr->rr_list, debris); @@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers( /* Free the orphan record */ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; fsb = XFS_AGB_TO_FSB(mp, agno, agbno); - error = xfs_refcount_free_cow_extent(tp, fsb, + xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); - if (error) - goto out_trans; /* Free the block. */ xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 1d9c518575e7..209795539c8d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -29,9 +29,9 @@ struct xfs_refcount_intent { xfs_extlen_t ri_blockcount; }; -extern int xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); -extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, @@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, - xfs_fsblock_t fsb, xfs_extlen_t len); -extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, - xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, + xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, + xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index e6aeb390b2fb..38e9414878b3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) { - irec->rm_flags = 0; irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); @@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper( rec->rm_flags); if (rec->rm_owner != info->high.rm_owner) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; *info->irec = *rec; *info->stat = 1; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor( error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, @@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper( rec->rm_flags); if (rec->rm_owner != info->high.rm_owner) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && (rec->rm_offset > info->high.rm_offset || rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; *info->irec = *rec; *info->stat = 1; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range( cur->bc_private.a.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, @@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed( * Record a rmap intent; the list is kept sorted first by AG and then by * increasing age. */ -static int +static void __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, @@ -2287,7 +2286,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; @@ -2295,11 +2294,10 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); - return 0; } /* Map an extent into a file. */ -int +void xfs_rmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -2307,15 +2305,15 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ -int +void xfs_rmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent( * Note that tp can be NULL here as no transaction is used for COW fork * unwritten conversion. */ -int +void xfs_rmap_convert_extent( struct xfs_mount *mp, struct xfs_trans *tp, @@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(mp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ -int +void xfs_rmap_alloc_extent( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent( struct xfs_bmbt_irec bmap; if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) - return 0; + return; bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ -int +void xfs_rmap_free_extent( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2386,14 +2384,14 @@ xfs_rmap_free_extent( struct xfs_bmbt_irec bmap; if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) - return 0; + return; bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ @@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper( ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; rks->has_rmap = true; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); + if (error < 0) + return error; + *has_rmap = rks.has_rmap; - return error; + return 0; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index e21ed0294e5c..abe633403fd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack( if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) return -EFSCORRUPTED; irec->rm_offset = XFS_RMAP_OFF(offset); + irec->rm_flags = 0; if (offset & XFS_RMAP_OFF_ATTR_FORK) irec->rm_flags |= XFS_RMAP_ATTR_FORK; if (offset & XFS_RMAP_OFF_BMBT_BLOCK) @@ -161,16 +162,16 @@ struct xfs_rmap_intent { }; /* functions for updating the rmapbt based on bmbt map/unmap operations */ -int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, +void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, +void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index e0641b7337b3..c45acbd3add9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -177,10 +177,4 @@ struct xfs_ino_geometry { unsigned int agino_log; /* #bits for agino in inum */ }; -/* Keep iterating the data structure. */ -#define XFS_ITER_CONTINUE (0) - -/* Stop iterating the data structure. */ -#define XFS_ITER_ABORT (1) - #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 802b34cd10fe..300b3e91ca3a 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +/* per-AG block reservation types */ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_AGFL, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_RMAPBT, +}; + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 16b09b941441..ba0f747c82e8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -639,7 +639,7 @@ xchk_agfl_block( xchk_agfl_block_xref(sc, agbno); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_ITER_ABORT; + return -ECANCELED; return 0; } @@ -730,7 +730,7 @@ xchk_agfl( /* Check the blocks in the AGFL. */ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), sc->sa.agfl_bp, xchk_agfl_block, &sai); - if (error == XFS_ITER_ABORT) { + if (error == -ECANCELED) { error = 0; goto out_free; } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 1afc58bf71dd..0edc7f8eb96e 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -80,7 +80,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); if (error) return error; } @@ -163,8 +163,6 @@ xchk_xattr_listent( args.valuelen = valuelen; error = xfs_attr_get_ilocked(context->dp, &args); - if (error == -EEXIST) - error = 0; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -173,7 +171,7 @@ xchk_xattr_listent( args.blkno); fail_xref: if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = XFS_ITER_ABORT; + context->seen_enough = 1; return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1bd29fdc2ab5..fa6ea6407992 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -75,6 +75,7 @@ struct xchk_bmap_info { xfs_fileoff_t lastoff; bool is_rt; bool is_shared; + bool was_loaded; int whichfork; }; @@ -213,25 +214,20 @@ xchk_bmap_xref_rmap( /* Cross-reference a single rtdev extent record. */ STATIC void -xchk_bmap_rt_extent_xref( - struct xchk_bmap_info *info, +xchk_bmap_rt_iextent_xref( struct xfs_inode *ip, - struct xfs_btree_cur *cur, + struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return; - xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); } /* Cross-reference a single datadev extent record. */ STATIC void -xchk_bmap_extent_xref( - struct xchk_bmap_info *info, +xchk_bmap_iextent_xref( struct xfs_inode *ip, - struct xfs_btree_cur *cur, + struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; @@ -240,9 +236,6 @@ xchk_bmap_extent_xref( xfs_extlen_t len; int error; - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return; - agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); len = irec->br_blockcount; @@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent( /* Scrub a single extent record. */ STATIC int -xchk_bmap_extent( +xchk_bmap_iextent( struct xfs_inode *ip, - struct xfs_btree_cur *cur, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - struct xfs_buf *bp = NULL; xfs_filblks_t end; int error = 0; - if (cur) - xfs_btree_get_block(cur, 0, &bp); - /* * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. @@ -364,10 +352,13 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + if (info->is_rt) - xchk_bmap_rt_extent_xref(info, ip, cur, irec); + xchk_bmap_rt_iextent_xref(ip, info, irec); else - xchk_bmap_extent_xref(info, ip, cur, irec); + xchk_bmap_iextent_xref(ip, info, irec); info->lastoff = irec->br_startoff + irec->br_blockcount; return error; @@ -380,10 +371,13 @@ xchk_bmapbt_rec( union xfs_btree_rec *rec) { struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec iext_irec; + struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; struct xfs_inode *ip = bs->cur->bc_private.b.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); uint64_t owner; int i; @@ -402,9 +396,26 @@ xchk_bmapbt_rec( } } - /* Set up the in-core record and scrub it. */ + /* + * Check that the incore extent tree contains an extent that matches + * this one exactly. We validate those cached bmaps later, so we don't + * need to check them here. If the incore extent tree was just loaded + * from disk by the scrubber, we assume that its contents match what's + * on disk (we still hold the ILOCK) and skip the equivalence check. + */ + if (!info->was_loaded) + return 0; + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); - return xchk_bmap_extent(ip, bs->cur, info, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, + &iext_irec) || + irec.br_startoff != iext_irec.br_startoff || + irec.br_startblock != iext_irec.br_startblock || + irec.br_blockcount != iext_irec.br_blockcount || + irec.br_state != iext_irec.br_state) + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; } /* Scan the btree records. */ @@ -415,15 +426,26 @@ xchk_bmap_btree( struct xchk_bmap_info *info) { struct xfs_owner_info oinfo; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; struct xfs_btree_cur *cur; int error; + /* Load the incore bmap cache if it's not loaded. */ + info->was_loaded = ifp->if_flags & XFS_IFEXTENTS; + if (!info->was_loaded) { + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + } + + /* Check the btree structure. */ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); xfs_btree_del_cursor(cur, error); +out: return error; } @@ -500,7 +522,7 @@ xchk_bmap_check_rmap( out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; return 0; } @@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps( sbcri.sc = sc; sbcri.whichfork = whichfork; error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; xfs_btree_del_cursor(cur, error); @@ -671,13 +693,6 @@ xchk_bmap( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; - /* Now try to scrub the in-memory extent list. */ - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(sc->tp, ip, whichfork); - if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; - } - /* Find the offset of the last extent in the mapping. */ error = xfs_bmap_last_offset(ip, &endoff, whichfork); if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) @@ -689,7 +704,7 @@ xchk_bmap( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - break; + goto out; if (isnullstartblock(irec.br_startblock)) continue; if (irec.br_startoff >= endoff) { @@ -697,7 +712,7 @@ xchk_bmap( irec.br_startoff); goto out; } - error = xchk_bmap_extent(ip, NULL, &info, &irec); + error = xchk_bmap_iextent(ip, &info, &irec); if (error) goto out; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index fc3f510c9034..98f82d7c8b40 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -125,7 +125,7 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); if (!sc->buf) return -ENOMEM; fsc = sc->buf; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4cfeec57fb05..b70a88bc975e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -351,7 +351,7 @@ xrep_init_btblock( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); - xfs_trans_log_buf(tp, bp, 0, bp->b_length); + xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); bp->b_ops = ops; *bpp = bp; @@ -664,7 +664,7 @@ xrep_findroot_agfl_walk( { xfs_agblock_t *agbno = priv; - return (*agbno == bno) ? XFS_ITER_ABORT : 0; + return (*agbno == bno) ? -ECANCELED : 0; } /* Does this block match the btree information passed in? */ @@ -694,7 +694,7 @@ xrep_findroot_block( if (owner == XFS_RMAP_OWN_AG) { error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, xrep_findroot_agfl_walk, &agbno); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) return 0; if (error) return error; diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 99c0b1234c3c..5641ae512c9e 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. */ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cbda40d40326..96d7071cfa46 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl; + struct xfs_acl *xfs_acl = NULL; unsigned char *ea_name; int error; int len; @@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. */ len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); - if (!xfs_acl) - return ERR_PTR(-ENOMEM); - - error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, - &len, ATTR_ROOT); + error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, + ATTR_ALLOC | ATTR_ROOT); if (error) { /* * If the attribute doesn't exist make sure we have a negative @@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type) } else { acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + kmem_free(xfs_acl); } - kmem_free(xfs_acl); return acl; } @@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct xfs_acl *xfs_acl; int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + xfs_acl = kmem_zalloc_large(len, 0); if (!xfs_acl) return -ENOMEM; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index dc93c51c17de..a640a285cc52 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive( * Allocate storage for a list of all the "remote" value extents. */ size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, KM_SLEEP); + list = kmem_alloc(size, 0); /* * Identify each of the "remote" value extents. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 58fc820a70c6..00758fdc2fec 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. */ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); /* * Scan the attribute list for the rest of the entries, storing diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9fa4a7ee8cfc..83d24e983d4c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -141,7 +141,7 @@ xfs_bui_init( { struct xfs_bui_log_item *buip; - buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + buip = kmem_zone_zalloc(xfs_bui_zone, 0); xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; @@ -218,7 +218,7 @@ xfs_trans_get_bud( { struct xfs_bud_log_item *budp; - budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + budp = kmem_zone_zalloc(xfs_bud_zone, 0); xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); budp->bud_buip = buip; @@ -542,9 +542,7 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp, ip, &irec); - if (error) - goto err_inode; + xfs_bmap_unmap_extent(tp, ip, &irec); } set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 98c6a7a71427..0910cb75b65d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -39,9 +39,9 @@ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_BB(ip->i_mount, fsb); + return XFS_FSB_TO_DADDR(ip->i_mount, fsb); } /* @@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap( trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); /* Remove the mapping from the donor file. */ - error = xfs_bmap_unmap_extent(tp, tip, &uirec); - if (error) - goto out; + xfs_bmap_unmap_extent(tp, tip, &uirec); /* Remove the mapping from the source file. */ - error = xfs_bmap_unmap_extent(tp, ip, &irec); - if (error) - goto out; + xfs_bmap_unmap_extent(tp, ip, &irec); /* Map the donor file's blocks into the source file. */ - error = xfs_bmap_map_extent(tp, ip, &uirec); - if (error) - goto out; + xfs_bmap_map_extent(tp, ip, &uirec); /* Map the source file's blocks into the donor file. */ - error = xfs_bmap_map_extent(tp, tip, &irec); - if (error) - goto out; + xfs_bmap_map_extent(tp, tip, &irec); error = xfs_defer_finish(tpp); tp = *tpp; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ca0849043f54..120ef99d09e8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -353,7 +353,8 @@ xfs_buf_allocate_memory( */ size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { - bp->b_addr = kmem_alloc(size, KM_NOFS); + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); + bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; @@ -368,7 +369,7 @@ xfs_buf_allocate_memory( } bp->b_offset = offset_in_page(bp->b_addr); bp->b_pages = bp->b_page_array; - bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_pages[0] = kmem_to_page(bp->b_addr); bp->b_page_count = 1; bp->b_flags |= _XBF_KMEM; return 0; @@ -1741,7 +1742,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); + btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c6e57a3f409e..f6ce17d8d848 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) +static inline int +xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) +{ + return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); +} + int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7dcaec54a20b..d74fbd1e9d3e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -702,7 +702,7 @@ xfs_buf_item_get_format( } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), - KM_SLEEP); + 0); if (!bip->bli_formats) return -ENOMEM; return 0; @@ -747,7 +747,7 @@ xfs_buf_item_init( return 0; } - bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fb1ad4483081..aeb95e7391c1 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -440,7 +440,7 @@ xfs_dquot_alloc( { struct xfs_dquot *dqp; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); dqp->dq_flags = type; dqp->q_core.d_id = cpu_to_be32(id); @@ -1239,7 +1239,7 @@ xfs_qm_exit(void) /* * Iterate every dquot of a particular type. The caller must ensure that the * particular quota type is active. iter_fn can return negative error codes, - * or XFS_ITER_ABORT to indicate that it wants to stop iterating. + * or -ECANCELED to indicate that it wants to stop iterating. */ int xfs_qm_dqiterate( diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 282ec5af293e..d60647d7197b 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init( { struct xfs_qoff_logitem *qf; - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 544c9482a0ef..849fd4476950 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -213,7 +213,7 @@ xfs_errortag_init( struct xfs_mount *mp) { mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 0ed68379e551..2183d87be4cf 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -33,7 +33,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); + new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); new->agno = agno; new->bno = bno; new->length = len; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 86f6512d6864..e44efc41a041 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -163,9 +163,9 @@ xfs_efi_init( if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efi_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, KM_SLEEP); + efip = kmem_zalloc(size, 0); } else { - efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + efip = kmem_zone_zalloc(xfs_efi_zone, 0); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); @@ -333,9 +333,9 @@ xfs_trans_get_efd( if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + (nextents - 1) * sizeof(struct xfs_extent), - KM_SLEEP); + 0); } else { - efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + efdp = kmem_zone_zalloc(xfs_efd_zone, 0); } xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 28101bbc0b78..d952d5962e93 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -28,6 +28,7 @@ #include <linux/falloc.h> #include <linux/backing-dev.h> #include <linux/mman.h> +#include <linux/fadvise.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -933,6 +934,30 @@ out_unlock: return error; } +STATIC int +xfs_file_fadvise( + struct file *file, + loff_t start, + loff_t end, + int advice) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + int ret; + int lockflags = 0; + + /* + * Operations creating pages in page cache need protection from hole + * punching and similar ops + */ + if (advice == POSIX_FADV_WILLNEED) { + lockflags = XFS_IOLOCK_SHARED; + xfs_ilock(ip, lockflags); + } + ret = generic_fadvise(file, start, end, advice); + if (lockflags) + xfs_iunlock(ip, lockflags); + return ret; +} STATIC loff_t xfs_file_remap_range( @@ -1232,6 +1257,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, }; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 5a8f9641562a..d082143feb5a 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -250,7 +250,7 @@ xfs_getfsmap_helper( rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* Are we just counting mappings? */ @@ -259,14 +259,14 @@ xfs_getfsmap_helper( info->head->fmh_entries++; if (info->last) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; info->head->fmh_entries++; rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* @@ -276,7 +276,7 @@ xfs_getfsmap_helper( */ if (rec_daddr > info->next_daddr) { if (info->head->fmh_entries >= info->head->fmh_count) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; fmr.fmr_device = info->dev; fmr.fmr_physical = info->next_daddr; @@ -295,7 +295,7 @@ xfs_getfsmap_helper( /* Fill out the extent we found */ if (info->head->fmh_entries >= info->head->fmh_count) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); @@ -328,7 +328,7 @@ out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* Transform a rmapbt irec into a fsmap */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0b0fd10a36d4..944add5ff8e0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -40,7 +40,7 @@ xfs_inode_alloc( * KM_MAYFAIL and return NULL here on ENOMEM. Set the * code up to do this anyway. */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + ip = kmem_zone_alloc(xfs_inode_zone, 0); if (!ip) return NULL; if (inode_init_always(mp->m_super, VFS_I(ip))) { diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d99a0a3e5f40..3ebd1b7f49d8 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -89,7 +89,7 @@ xfs_icreate_log( { struct xfs_icreate_item *icp; - icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + icp = kmem_zone_zalloc(xfs_icreate_zone, 0); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6467d5e1df2d..18f4b262e61c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref( if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) return 0; - iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu = kmem_zalloc(sizeof(*iu), KM_NOFS); iu->iu_agino = prev_agino; iu->iu_next_unlinked = this_agino; @@ -3282,7 +3282,8 @@ xfs_rename( spaceres); /* - * Set up the target. + * Check for expected errors before we dirty the transaction + * so we can return an error without a transaction abort. */ if (target_ip == NULL) { /* @@ -3294,6 +3295,46 @@ xfs_rename( if (error) goto out_trans_cancel; } + } else { + /* + * If target exists and it's a directory, check that whether + * it can be destroyed. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode) && + (!xfs_dir_isempty(target_ip) || + (VFS_I(target_ip)->i_nlink > 2))) { + error = -EEXIST; + goto out_trans_cancel; + } + } + + /* + * Directory entry creation below may acquire the AGF. Remove + * the whiteout from the unlinked list first to preserve correct + * AGI/AGF locking order. This dirties the transaction so failures + * after this point will abort and log recovery will clean up the + * mess. + * + * For whiteouts, we need to bump the link count on the whiteout + * inode. After this point, we have a real link, clear the tmpfile + * state flag from the inode so it doesn't accidentally get misused + * in future. + */ + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0); + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_cancel; + + xfs_bumplink(tp, wip); + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { /* * If target does not exist and the rename crosses * directories, adjust the target directory link count @@ -3312,22 +3353,6 @@ xfs_rename( } } else { /* target_ip != NULL */ /* - * If target exists and it's a directory, check that both - * target and source are directories and that target can be - * destroyed, or that neither is a directory. - */ - if (S_ISDIR(VFS_I(target_ip)->i_mode)) { - /* - * Make sure target dir is empty. - */ - if (!(xfs_dir_isempty(target_ip)) || - (VFS_I(target_ip)->i_nlink > 2)) { - error = -EEXIST; - goto out_trans_cancel; - } - } - - /* * Link the source inode under the target name. * If the source inode is a directory and we are moving * it across directories, its ".." entry will be @@ -3417,30 +3442,6 @@ xfs_rename( if (error) goto out_trans_cancel; - /* - * For whiteouts, we need to bump the link count on the whiteout inode. - * This means that failures all the way up to this point leave the inode - * on the unlinked list and so cleanup is a simple matter of dropping - * the remaining reference to it. If we fail here after bumping the link - * count, we're shutting down the filesystem so we'll never see the - * intermediate state on disk. - */ - if (wip) { - ASSERT(VFS_I(wip)->i_nlink == 0); - xfs_bumplink(tp, wip); - error = xfs_iunlink_remove(tp, wip); - if (error) - goto out_trans_cancel; - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); - - /* - * Now we have a real link, clear the "I'm a tmpfile" state - * flag from the inode so it doesn't accidentally get misused in - * future. - */ - VFS_I(wip)->i_state &= ~I_LINKABLE; - } - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c9a502eed204..bb8f076805b9 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -651,7 +651,7 @@ xfs_inode_item_init( struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f7848cd5527..d58f0d6a699e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -67,7 +67,7 @@ xfs_find_handle( return -EBADF; inode = file_inode(f.file); } else { - error = user_lpath((const char __user *)hreq->path, &path); + error = user_path_at(AT_FDCWD, hreq->path, 0, &path); if (error) return error; inode = d_inode(path.dentry); @@ -396,7 +396,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); if (!kbuf) goto out_dput; @@ -434,11 +434,11 @@ xfs_attrmulti_attr_get( if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, KM_SLEEP); + kbuf = kmem_zalloc_large(*len, 0); if (!kbuf) return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); if (error) goto out_kfree; @@ -831,7 +831,7 @@ xfs_bulkstat_fmt( /* * Check the incoming bulk request @hdr from userspace and initialize the * internal @breq bulk request appropriately. Returns 0 if the bulk request - * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual + * should proceed; -ECANCELED if there's nothing to do; or the usual * negative error code. */ static int @@ -889,13 +889,13 @@ xfs_bulk_ireq_setup( /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) - return XFS_ITER_ABORT; + return -ECANCELED; } else if (hdr->agno) return -EINVAL; /* Asking for an inode past the end of the FS? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) - return XFS_ITER_ABORT; + return -ECANCELED; return 0; } @@ -936,7 +936,7 @@ xfs_ioc_bulkstat( return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; @@ -986,7 +986,7 @@ xfs_ioc_inumbers( return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; @@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry( if (copy_from_user(&ageo, arg, sizeof(ageo))) return -EFAULT; + if (ageo.ag_flags) + return -EINVAL; + if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) + return -EINVAL; error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); if (error) @@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (S_ISREG(inode->i_mode) && - !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), sb->s_blocksize)) return -EINVAL; } @@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap( info.mp = ip->i_mount; info.data = arg; error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == -ECANCELED) { error = 0; aborted = true; } else if (error) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7bd7534f5051..1e08bf79b478 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); if (!kbuf) goto out_dput; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3a4310d7cb59..f780e223b118 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -58,7 +58,7 @@ xfs_bmbt_to_iomap( { struct xfs_mount *mp = ip->i_mount; - if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); if (imap->br_startblock == HOLESTARTBLOCK) { @@ -297,7 +297,7 @@ xfs_iomap_write_direct( goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) error = xfs_alert_fsblock_zero(ip, imap); out_unlock: @@ -814,7 +814,7 @@ xfs_iomap_write_unwritten( if (error) return error; - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) return xfs_alert_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f5c955d35be4..884950adbd16 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -137,7 +137,7 @@ xfs_bulkstat_one_int( xfs_irele(ip); error = bc->formatter(bc->breq, buf); - if (error == XFS_IBULK_ABORT) + if (error == -ECANCELED) goto out_advance; if (error) goto out; @@ -169,7 +169,7 @@ xfs_bulkstat_one( ASSERT(breq->icount == 1); bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -181,7 +181,7 @@ xfs_bulkstat_one( * If we reported one inode to userspace then we abort because we hit * the end of the buffer. Don't leak that back to userspace. */ - if (error == XFS_IWALK_ABORT) + if (error == -ECANCELED) error = 0; return error; @@ -243,7 +243,7 @@ xfs_bulkstat( return 0; bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -342,7 +342,7 @@ xfs_inumbers_walk( int error; error = ic->formatter(ic->breq, &inogrp); - if (error && error != XFS_IBULK_ABORT) + if (error && error != -ECANCELED) return error; ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index e90c1fc5b981..96a1e2a9be3f 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -18,9 +18,6 @@ struct xfs_ibulk { /* Only iterate within the same AG as startino */ #define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) -/* Return value that means we want to abort the walk. */ -#define XFS_IBULK_ABORT (XFS_IWALK_ABORT) - /* * Advance the user buffer pointer by one record of the given size. If the * buffer is now full, return the appropriate error code. @@ -34,13 +31,21 @@ xfs_ibulk_advance( breq->ubuffer = b + bytes; breq->ocount++; - return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; + return breq->ocount == breq->icount ? -ECANCELED : 0; } /* * Return stat information in bulk (by-inode) for the filesystem. */ +/* + * Return codes for the formatter function are 0 to continue iterating, and + * non-zero to stop iterating. Any non-zero value will be passed up to the + * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop + * iteration, as neither bulkstat nor inumbers will ever generate that error + * code on their own. + */ + typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 8c7d727149ea..aa375cf53021 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -31,7 +31,7 @@ * inode it finds, it calls a walk function with the relevant inode number and * a pointer to caller-provided data. The walk function can return the usual * negative error code to stop the iteration; 0 to continue the iteration; or - * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the + * -ECANCELED to stop the iteration. This return value is returned to the * caller. * * Internally, we allow the walk function to do anything, which means that we @@ -616,7 +616,7 @@ xfs_iwalk_threaded( if (xfs_pwork_ctl_want_abort(&pctl)) break; - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); iwag->mp = mp; iwag->iwalk_fn = iwalk_fn; iwag->data = data; diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 6c960e10ed4d..37a795f03267 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -6,12 +6,17 @@ #ifndef __XFS_IWALK_H__ #define __XFS_IWALK_H__ +/* + * Return codes for the inode/inobt walk function are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iwalk or inobt_walk caller. The special value -ECANCELED can be used to + * stop iteration, as neither iwalk nor inobt_walk will ever generate that + * error code on their own. + */ + /* Walk all inodes in the filesystem starting from @startino. */ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, void *data); -/* Return values for xfs_iwalk_fn. */ -#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) -#define XFS_IWALK_ABORT (XFS_ITER_ABORT) int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, @@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, const struct xfs_inobt_rec_incore *irec, void *data); -/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */ -#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT) int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7fc3c1ad36bc..a2beee9f74da 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -214,15 +214,42 @@ xlog_grant_head_wake( { struct xlog_ticket *tic; int need_bytes; + bool woken_task = false; list_for_each_entry(tic, &head->waiters, t_queue) { + + /* + * There is a chance that the size of the CIL checkpoints in + * progress at the last AIL push target calculation resulted in + * limiting the target to the log head (l_last_sync_lsn) at the + * time. This may not reflect where the log head is now as the + * CIL checkpoints may have completed. + * + * Hence when we are woken here, it may be that the head of the + * log that has moved rather than the tail. As the tail didn't + * move, there still won't be space available for the + * reservation we require. However, if the AIL has already + * pushed to the target defined by the old log head location, we + * will hang here waiting for something else to update the AIL + * push target. + * + * Therefore, if there isn't space to wake the first waiter on + * the grant head, we need to push the AIL again to ensure the + * target reflects both the current log tail and log head + * position before we wait for the tail to move again. + */ + need_bytes = xlog_ticket_reservation(log, head, tic); - if (*free_bytes < need_bytes) + if (*free_bytes < need_bytes) { + if (!woken_task) + xlog_grant_push_ail(log, need_bytes); return false; + } *free_bytes -= need_bytes; trace_xfs_log_grant_wake_up(log, tic); wake_up_process(tic->t_task); + woken_task = true; } return true; @@ -428,8 +455,7 @@ xfs_log_reserve( XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, - KM_SLEEP); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -1404,6 +1430,7 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1415,8 +1442,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, - KM_MAYFAIL); + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, + KM_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG @@ -2496,21 +2523,35 @@ next_lv: ***************************************************************************** */ -/* Clean iclogs starting from the head. This ordering must be - * maintained, so an iclog doesn't become ACTIVE beyond one that - * is SYNCING. This is also required to maintain the notion that we use - * a ordered wait queue to hold off would be writers to the log when every - * iclog is trying to sync to disk. +/* + * An iclog has just finished IO completion processing, so we need to update + * the iclog state and propagate that up into the overall log state. Hence we + * prepare the iclog for cleaning, and then clean all the pending dirty iclogs + * starting from the head, and then wake up any threads that are waiting for the + * iclog to be marked clean. + * + * The ordering of marking iclogs ACTIVE must be maintained, so an iclog + * doesn't become ACTIVE beyond one that is SYNCING. This is also required to + * maintain the notion that we use a ordered wait queue to hold off would be + * writers to the log when every iclog is trying to sync to disk. + * + * Caller must hold the icloglock before calling us. * - * State Change: DIRTY -> ACTIVE + * State Change: !IOERROR -> DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log( - struct xlog *log) +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) { - xlog_in_core_t *iclog; - int changed = 0; + struct xlog_in_core *iclog; + int changed = 0; + /* Prepare the completed iclog. */ + if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR)) + dirty_iclog->ic_state = XLOG_STATE_DIRTY; + + /* Walk all the iclogs to update the ordered active state. */ iclog = log->l_iclog; do { if (iclog->ic_state == XLOG_STATE_DIRTY) { @@ -2548,7 +2589,13 @@ xlog_state_clean_log( iclog = iclog->ic_next; } while (iclog != log->l_iclog); - /* log is locked when we are called */ + + /* + * Wake up threads waiting in xfs_log_force() for the dirty iclog + * to be cleaned. + */ + wake_up_all(&dirty_iclog->ic_force_wait); + /* * Change state for the dummy log recording. * We usually go to NEED. But we go to NEED2 if the changed indicates @@ -2582,7 +2629,7 @@ xlog_state_clean_log( ASSERT(0); } } -} /* xlog_state_clean_log */ +} STATIC xfs_lsn_t xlog_get_lowest_lsn( @@ -2603,30 +2650,205 @@ xlog_get_lowest_lsn( return lowest_lsn; } +/* + * Completion of a iclog IO does not imply that a transaction has completed, as + * transactions can be large enough to span many iclogs. We cannot change the + * tail of the log half way through a transaction as this may be the only + * transaction in the log and moving the tail to point to the middle of it + * will prevent recovery from finding the start of the transaction. Hence we + * should only update the last_sync_lsn if this iclog contains transaction + * completion callbacks on it. + * + * We have to do this before we drop the icloglock to ensure we are the only one + * that can update it. + * + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick + * the reservation grant head pushing. This is due to the fact that the push + * target is bound by the current last_sync_lsn value. Hence if we have a large + * amount of log space bound up in this committing transaction then the + * last_sync_lsn value may be the limiting factor preventing tail pushing from + * freeing space in the log. Hence once we've updated the last_sync_lsn we + * should push the AIL to ensure the push target (and hence the grant head) is + * no longer bound by the old log head location and can move forwards and make + * progress again. + */ +static void +xlog_state_set_callback( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t header_lsn) +{ + iclog->ic_state = XLOG_STATE_CALLBACK; + + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + header_lsn) <= 0); + + if (list_empty_careful(&iclog->ic_callbacks)) + return; + + atomic64_set(&log->l_last_sync_lsn, header_lsn); + xlog_grant_push_ail(log, 0); +} + +/* + * Return true if we need to stop processing, false to continue to the next + * iclog. The caller will need to run callbacks if the iclog is returned in the + * XLOG_STATE_CALLBACK state. + */ +static bool +xlog_state_iodone_process_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + struct xlog_in_core *completed_iclog, + bool *ioerror) +{ + xfs_lsn_t lowest_lsn; + xfs_lsn_t header_lsn; + + /* Skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) + return false; + + /* + * Between marking a filesystem SHUTDOWN and stopping the log, we do + * flush all iclogs to disk (if there wasn't a log I/O error). So, we do + * want things to go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + *ioerror = true; + return false; + } + + /* + * Can only perform callbacks in order. Since this iclog is not in the + * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean + * up. If we set our iclog to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the CALLBACK and the state + * cannot change since we are holding the l_icloglock. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { + if (completed_iclog && + (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) { + completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + return true; + } + + /* + * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC + * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught + * by the above if and are going to clean (i.e. we aren't doing their + * callbacks) see the above if. + * + * We will do one more check here to see if we have chased our tail + * around. If this is not the lowest lsn iclog, then we will leave it + * for another completion to process. + */ + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) + return false; + + xlog_state_set_callback(log, iclog, header_lsn); + return false; + +} + +/* + * Keep processing entries in the iclog callback list until we come around and + * it is empty. We need to atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more callbacks being added. + * + * This function is called with the icloglock held and returns with it held. We + * drop it while running callbacks, however, as holding it over thousands of + * callbacks is unnecessary and causes excessive contention if we do. + */ +static void +xlog_state_do_iclog_callbacks( + struct xlog *log, + struct xlog_in_core *iclog, + bool aborted) +{ + spin_unlock(&log->l_icloglock); + spin_lock(&iclog->ic_callback_lock); + while (!list_empty(&iclog->ic_callbacks)) { + LIST_HEAD(tmp); + + list_splice_init(&iclog->ic_callbacks, &tmp); + + spin_unlock(&iclog->ic_callback_lock); + xlog_cil_process_committed(&tmp, aborted); + spin_lock(&iclog->ic_callback_lock); + } + + /* + * Pick up the icloglock while still holding the callback lock so we + * serialise against anyone trying to add more callbacks to this iclog + * now we've finished processing. + */ + spin_lock(&log->l_icloglock); + spin_unlock(&iclog->ic_callback_lock); +} + +#ifdef DEBUG +/* + * Make one last gasp attempt to see if iclogs are being left in limbo. If the + * above loop finds an iclog earlier than the current iclog and in one of the + * syncing states, the current iclog is put into DO_CALLBACK and the callbacks + * are deferred to the completion of the earlier iclog. Walk the iclogs in order + * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in + * one of the syncing states. + * + * Note that SYNCING|IOERROR is a valid state so we cannot just check for + * ic_state == SYNCING. + */ +static void +xlog_state_callback_check_state( + struct xlog *log) +{ + struct xlog_in_core *first_iclog = log->l_iclog; + struct xlog_in_core *iclog = first_iclog; + + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. + * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * l_icloglock + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state & XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); +} +#else +#define xlog_state_callback_check_state(l) ((void)0) +#endif + STATIC void xlog_state_do_callback( struct xlog *log, bool aborted, struct xlog_in_core *ciclog) { - xlog_in_core_t *iclog; - xlog_in_core_t *first_iclog; /* used to know when we've - * processed all iclogs once */ - int flushcnt = 0; - xfs_lsn_t lowest_lsn; - int ioerrors; /* counter: iclogs with errors */ - int loopdidcallbacks; /* flag: inner loop did callbacks*/ - int funcdidcallbacks; /* flag: function did callbacks */ - int repeats; /* for issuing console warnings if - * looping too many times */ - int wake = 0; + struct xlog_in_core *iclog; + struct xlog_in_core *first_iclog; + bool did_callbacks = false; + bool cycled_icloglock; + bool ioerror; + int flushcnt = 0; + int repeats = 0; spin_lock(&log->l_icloglock); - first_iclog = iclog = log->l_iclog; - ioerrors = 0; - funcdidcallbacks = 0; - repeats = 0; - do { /* * Scan all iclogs starting with the one pointed to by the @@ -2638,137 +2860,34 @@ xlog_state_do_callback( */ first_iclog = log->l_iclog; iclog = log->l_iclog; - loopdidcallbacks = 0; + cycled_icloglock = false; + ioerror = false; repeats++; do { + if (xlog_state_iodone_process_iclog(log, iclog, + ciclog, &ioerror)) + break; - /* skip all iclogs in the ACTIVE & DIRTY states */ - if (iclog->ic_state & - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + if (!(iclog->ic_state & + (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) { iclog = iclog->ic_next; continue; } /* - * Between marking a filesystem SHUTDOWN and stopping - * the log, we do flush all iclogs to disk (if there - * wasn't a log I/O error). So, we do want things to - * go smoothly in case of just a SHUTDOWN w/o a - * LOG_IO_ERROR. - */ - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - /* - * Can only perform callbacks in order. Since - * this iclog is not in the DONE_SYNC/ - * DO_CALLBACK state, we skip the rest and - * just try to clean up. If we set our iclog - * to DO_CALLBACK, we will not process it when - * we retry since a previous iclog is in the - * CALLBACK and the state cannot change since - * we are holding the l_icloglock. - */ - if (!(iclog->ic_state & - (XLOG_STATE_DONE_SYNC | - XLOG_STATE_DO_CALLBACK))) { - if (ciclog && (ciclog->ic_state == - XLOG_STATE_DONE_SYNC)) { - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; - } - break; - } - /* - * We now have an iclog that is in either the - * DO_CALLBACK or DONE_SYNC states. The other - * states (WANT_SYNC, SYNCING, or CALLBACK were - * caught by the above if and are going to - * clean (i.e. we aren't doing their callbacks) - * see the above if. - */ - - /* - * We will do one more check here to see if we - * have chased our tail around. - */ - - lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && - XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { - iclog = iclog->ic_next; - continue; /* Leave this iclog for - * another thread */ - } - - iclog->ic_state = XLOG_STATE_CALLBACK; - - - /* - * Completion of a iclog IO does not imply that - * a transaction has completed, as transactions - * can be large enough to span many iclogs. We - * cannot change the tail of the log half way - * through a transaction as this may be the only - * transaction in the log and moving th etail to - * point to the middle of it will prevent - * recovery from finding the start of the - * transaction. Hence we should only update the - * last_sync_lsn if this iclog contains - * transaction completion callbacks on it. - * - * We have to do this before we drop the - * icloglock to ensure we are the only one that - * can update it. - */ - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - if (!list_empty_careful(&iclog->ic_callbacks)) - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); - - } else - ioerrors++; - - spin_unlock(&log->l_icloglock); - - /* - * Keep processing entries in the callback list until - * we come around and it is empty. We need to - * atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more - * callbacks being added. - */ - spin_lock(&iclog->ic_callback_lock); - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp); - - list_splice_init(&iclog->ic_callbacks, &tmp); - - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); - spin_lock(&iclog->ic_callback_lock); - } - - loopdidcallbacks++; - funcdidcallbacks++; - - spin_lock(&log->l_icloglock); - spin_unlock(&iclog->ic_callback_lock); - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) - iclog->ic_state = XLOG_STATE_DIRTY; - - /* - * Transition from DIRTY to ACTIVE if applicable. - * NOP if STATE_IOERROR. + * Running callbacks will drop the icloglock which means + * we'll have to run at least one more complete loop. */ - xlog_state_clean_log(log); - - /* wake up threads waiting in xfs_log_force() */ - wake_up_all(&iclog->ic_force_wait); + cycled_icloglock = true; + xlog_state_do_iclog_callbacks(log, iclog, aborted); + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); + did_callbacks |= cycled_icloglock; + if (repeats > 5000) { flushcnt += repeats; repeats = 0; @@ -2776,50 +2895,15 @@ xlog_state_do_callback( "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerrors && loopdidcallbacks); + } while (!ioerror && cycled_icloglock); -#ifdef DEBUG - /* - * Make one last gasp attempt to see if iclogs are being left in limbo. - * If the above loop finds an iclog earlier than the current iclog and - * in one of the syncing states, the current iclog is put into - * DO_CALLBACK and the callbacks are deferred to the completion of the - * earlier iclog. Walk the iclogs in order and make sure that no iclog - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing - * states. - * - * Note that SYNCING|IOABORT is a valid state so we cannot just check - * for ic_state == SYNCING. - */ - if (funcdidcallbacks) { - first_iclog = iclog = log->l_iclog; - do { - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); - /* - * Terminate the loop if iclogs are found in states - * which will cause other threads to clean up iclogs. - * - * SYNCING - i/o completion will go through logs - * DONE_SYNC - interrupt thread should be waiting for - * l_icloglock - * IOERROR - give up hope all ye who enter here - */ - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state & XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_DONE_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR ) - break; - iclog = iclog->ic_next; - } while (first_iclog != iclog); - } -#endif + if (did_callbacks) + xlog_state_callback_check_state(log); if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) - wake = 1; - spin_unlock(&log->l_icloglock); - - if (wake) wake_up_all(&log->l_flush_wait); + + spin_unlock(&log->l_icloglock); } @@ -3919,7 +4003,9 @@ xfs_log_force_umount( * item committed callback functions will do this again under lock to * avoid races. */ + spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); + spin_unlock(&log->l_cilp->xc_push_lock); xlog_state_do_callback(log, true, NULL); #ifdef XFSERRORDEBUG diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index fa5602d0fd7f..ef652abd112c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -38,7 +38,7 @@ xlog_cil_ticket_alloc( struct xlog_ticket *tic; tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, - KM_SLEEP|KM_NOFS); + KM_NOFS); /* * set the current reservation to zero so we know to steal the basic @@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); + lv = kmem_alloc_large(buf_size, KM_NOFS); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -660,7 +660,7 @@ xlog_cil_push( if (!cil) return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); down_write(&cil->xc_ctx_lock); @@ -1179,11 +1179,11 @@ xlog_cil_init( struct xfs_cil *cil; struct xfs_cil_ctx *ctx; - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) return -ENOMEM; - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); if (!ctx) { kmem_free(cil); return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13d1d3e95b88..508319039dce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -97,6 +97,8 @@ xlog_alloc_buffer( struct xlog *log, int nbblks) { + int align_mask = xfs_buftarg_dma_alignment(log->l_targ); + /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. @@ -125,7 +127,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); } /* @@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1( } } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); bcp->bc_blkno = buf_f->blf_blkno; bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; @@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2( if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -4161,7 +4163,7 @@ xlog_recover_add_item( { xlog_recover_item_t *item; - item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, 0); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans( return 0; } - ptr = kmem_alloc(len, KM_SLEEP); + ptr = kmem_alloc(len, 0); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; @@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans( item->ri_total = in_f->ilf_size; item->ri_buf = kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - KM_SLEEP); + 0); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ @@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans( * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. */ - trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans = kmem_zalloc(sizeof(struct xlog_recover), 0); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); @@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink( } /* - * xlog_iunlink_recover + * Recover AGI unlinked lists + * + * This is called during recovery to process any inodes which we unlinked but + * not freed when the system crashed. These inodes will be on the lists in the + * AGI blocks. What we do here is scan all the AGIs and fully truncate and free + * any inodes found on the lists. Each inode is removed from the lists when it + * has been fully truncated and is freed. The freeing of the inode and its + * removal from the list must be atomic. + * + * If everything we touch in the agi processing loop is already in memory, this + * loop can hold the cpu for a long time. It runs without lock contention, + * memory allocation contention, the need wait for IO, etc, and so will run + * until we either run out of inodes to process, run low on memory or we run out + * of log space. * - * This is called during recovery to process any inodes which - * we unlinked but not freed when the system crashed. These - * inodes will be on the lists in the AGI blocks. What we do - * here is scan all the AGIs and fully truncate and free any - * inodes found on the lists. Each inode is removed from the - * lists when it has been fully truncated and is freed. The - * freeing of the inode and its removal from the list must be - * atomic. + * This behaviour is bad for latency on single CPU and non-preemptible kernels, + * and can prevent other filesytem work (such as CIL pushes) from running. This + * can lead to deadlocks if the recovery process runs out of log reservation + * space. Hence we need to yield the CPU when there is other kernel work + * scheduled on this CPU to ensure other scheduled work can run without undue + * latency. */ STATIC void xlog_recover_process_iunlinks( @@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks( while (agino != NULLAGINO) { agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); + cond_resched(); } } xfs_buf_rele(agibp); @@ -5527,7 +5541,7 @@ xlog_do_log_recovery( */ log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * sizeof(struct list_head), - KM_SLEEP); + 0); for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 322da6909290..ba5b6f3b2b88 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -82,7 +82,7 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - KM_SLEEP); + 0); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -214,7 +214,7 @@ xfs_initialize_perag( spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - BUG(); + WARN_ON_ONCE(1); spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4adb6837439a..fdb60e09a9c5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* per-AG block reservation data structures*/ -enum xfs_ag_resv_type { - XFS_AG_RESV_NONE = 0, - XFS_AG_RESV_AGFL, - XFS_AG_RESV_METADATA, - XFS_AG_RESV_RMAPBT, -}; - struct xfs_ag_resv { /* number of blocks originally reserved here */ xfs_extlen_t ar_orig_reserved; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 74738813f60d..a06661dac5be 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -333,12 +333,12 @@ xfs_mru_cache_create( if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) return -EINVAL; - if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + if (!(mru = kmem_zalloc(sizeof(*mru), 0))) return -ENOMEM; /* An extra list is needed to avoid reaping up to a grp_time early. */ mru->grp_count = grp_count + 1; - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); if (!mru->lists) { err = -ENOMEM; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5e7a37f0cf84..ecd8ce152ab1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -642,7 +642,7 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0); error = list_lru_init(&qinf->qi_lru); if (error) @@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf( if (qip->i_d.di_nblocks == 0) return 0; - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d8288aa0670a..2328268e6245 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -144,9 +144,9 @@ xfs_cui_init( ASSERT(nextents > 0); if (nextents > XFS_CUI_MAX_FAST_EXTENTS) cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - KM_SLEEP); + 0); else - cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; @@ -223,7 +223,7 @@ xfs_trans_get_cud( { struct xfs_cud_log_item *cudp; - cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + cudp = kmem_zone_zalloc(xfs_cud_zone, 0); xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); cudp->cud_cuip = cuip; @@ -555,26 +555,24 @@ xfs_cui_recover( irec.br_blockcount = new_len; switch (type) { case XFS_REFCOUNT_INCREASE: - error = xfs_refcount_increase_extent(tp, &irec); + xfs_refcount_increase_extent(tp, &irec); break; case XFS_REFCOUNT_DECREASE: - error = xfs_refcount_decrease_extent(tp, &irec); + xfs_refcount_decrease_extent(tp, &irec); break; case XFS_REFCOUNT_ALLOC_COW: - error = xfs_refcount_alloc_cow_extent(tp, + xfs_refcount_alloc_cow_extent(tp, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: - error = xfs_refcount_free_cow_extent(tp, + xfs_refcount_free_cow_extent(tp, irec.br_startblock, irec.br_blockcount); break; default: ASSERT(0); } - if (error) - goto abort_error; requeue_only = true; } } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index edbe37b7f636..0f08153b4994 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks( ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(*tpp, - del.br_startblock, del.br_blockcount); - if (error) - break; + xfs_refcount_free_cow_extent(*tpp, del.br_startblock, + del.br_blockcount); xfs_bmap_add_free(*tpp, del.br_startblock, del.br_blockcount, NULL); @@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent( trace_xfs_reflink_cow_remap(ip, &del); /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, - del.br_blockcount); - if (error) - goto out_cancel; + xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &del); - if (error) - goto out_cancel; + xfs_bmap_map_extent(tp, ip, &del); /* Charge this new data fork mapping to the on-disk quota. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, @@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent( uirec.br_blockcount, uirec.br_startblock); /* Update the refcount tree */ - error = xfs_refcount_increase_extent(tp, &uirec); - if (error) - goto out_cancel; + xfs_refcount_increase_extent(tp, &uirec); /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &uirec); - if (error) - goto out_cancel; + xfs_bmap_map_extent(tp, ip, &uirec); /* Update quota accounting. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 77ed557b6127..8939e0ea09cd 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -142,9 +142,9 @@ xfs_rui_init( ASSERT(nextents > 0); if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); else - ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -244,7 +244,7 @@ xfs_trans_get_rud( { struct xfs_rud_log_item *rudp; - rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); + rudp = kmem_zone_zalloc(xfs_rud_zone, 0); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); rudp->rud_ruip = ruip; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5fa4db3c3e32..4a48a8c75b4f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -865,7 +865,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -963,7 +963,7 @@ xfs_growfs_rt( /* * Allocate a new (fake) mount/sb. */ - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + nmp = kmem_alloc(sizeof(*nmp), 0); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f9450235533c..8d1df9f8be07 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -818,7 +818,8 @@ xfs_init_mount_workqueues( goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, + 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; @@ -1663,6 +1664,8 @@ xfs_fs_fill_super( sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; + sb->s_time_min = S32_MIN; + sb->s_time_max = S32_MAX; sb->s_iflags |= SB_I_CGROUPWB; set_posix_acl_flag(sb); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8094b1920eef..eaae275ed430 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -23,6 +23,7 @@ struct xlog; struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; +struct xlog_rec_header; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; @@ -30,6 +31,10 @@ struct xfs_btree_cur; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; +struct xfs_icreate_log; +struct xfs_owner_info; +struct xfs_trans_res; +struct xfs_inobt_rec_incore; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init, __entry->nr_threads, __entry->pid) ) +DECLARE_EVENT_CLASS(xfs_kmem_class, + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), + TP_ARGS(size, flags, caller_ip), + TP_STRUCT__entry( + __field(ssize_t, size) + __field(int, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->size = size; + __entry->flags = flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("size %zd flags 0x%x caller %pS", + __entry->size, + __entry->flags, + (char *)__entry->caller_ip) +) + +#define DEFINE_KMEM_EVENT(name) \ +DEFINE_EVENT(xfs_kmem_class, name, \ + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ + TP_ARGS(size, flags, caller_ip)) +DEFINE_KMEM_EVENT(kmem_alloc); +DEFINE_KMEM_EVENT(kmem_alloc_io); +DEFINE_KMEM_EVENT(kmem_alloc_large); +DEFINE_KMEM_EVENT(kmem_realloc); +DEFINE_KMEM_EVENT(kmem_zone_alloc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d42a68d8313b..f4795fdb7389 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -90,7 +90,7 @@ xfs_trans_dup( trace_xfs_trans_dup(tp, _RET_IP_); - ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + ntp = kmem_zone_zalloc(xfs_trans_zone, 0); /* * Initialize the new transaction structure. @@ -263,7 +263,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + tp = kmem_zone_zalloc(xfs_trans_zone, 0); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 1027c9ca6eb8..16457465833b 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -863,7 +863,7 @@ STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0); } void diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 3123b5aaad2a..cb895b1df5e4 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, value = NULL; } - error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); if (error) return error; return asize; |