diff options
Diffstat (limited to 'fs')
317 files changed, 8932 insertions, 6571 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 92cd1d80218d..6ecf863bfa2f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -213,7 +213,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); - /* fall through */ + fallthrough; case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; @@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) inode = file_inode(vma->vm_file); - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 74df32be4c6a..e34fa20acf61 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (ret) return ret; - if (v9ses->cache) - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + if (!v9ses->cache) { + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + } sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) diff --git a/fs/Makefile b/fs/Makefile index 1c7b0e3f6daa..d72ee2ce7af0 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -37,7 +37,6 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 30d526fecc3f..05e963402e25 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -18,11 +18,11 @@ static inline unsigned int adfs_readval(unsigned char *p, int len) switch (len) { case 4: val |= p[3] << 24; - /* fall through */ + fallthrough; case 3: val |= p[2] << 16; - /* fall through */ + fallthrough; case 2: val |= p[1] << 8; - /* fall through */ + fallthrough; default: val |= p[0]; } return val; 
@@ -32,11 +32,11 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) { switch (len) { case 4: p[3] = val >> 24; - /* fall through */ + fallthrough; case 3: p[2] = val >> 16; - /* fall through */ + fallthrough; case 2: p[1] = val >> 8; - /* fall through */ + fallthrough; default: p[0] = val; } } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index f708c45d5f66..29f11e10a7c7 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -420,24 +420,51 @@ affs_mode_to_prot(struct inode *inode) u32 prot = AFFS_I(inode)->i_protect; umode_t mode = inode->i_mode; + /* + * First, clear all RWED bits for owner, group, other. + * Then, recalculate them afresh. + * + * We'll always clear the delete-inhibit bit for the owner, as that is + * the classic single-user mode AmigaOS protection bit and we need to + * stay compatible with all scenarios. + * + * Since multi-user AmigaOS is an extension, we'll only set the + * delete-allow bit if any of the other bits in the same user class + * (group/other) are used. + */ + prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD + | FIBF_NOWRITE | FIBF_NODELETE + | FIBF_GRP_EXECUTE | FIBF_GRP_READ + | FIBF_GRP_WRITE | FIBF_GRP_DELETE + | FIBF_OTR_EXECUTE | FIBF_OTR_READ + | FIBF_OTR_WRITE | FIBF_OTR_DELETE); + + /* Classic single-user AmigaOS flags. These are inverted. */ if (!(mode & 0100)) prot |= FIBF_NOEXECUTE; if (!(mode & 0400)) prot |= FIBF_NOREAD; if (!(mode & 0200)) prot |= FIBF_NOWRITE; + + /* Multi-user extended flags. Not inverted. 
*/ if (mode & 0010) prot |= FIBF_GRP_EXECUTE; if (mode & 0040) prot |= FIBF_GRP_READ; if (mode & 0020) prot |= FIBF_GRP_WRITE; + if (mode & 0070) + prot |= FIBF_GRP_DELETE; + if (mode & 0001) prot |= FIBF_OTR_EXECUTE; if (mode & 0004) prot |= FIBF_OTR_READ; if (mode & 0002) prot |= FIBF_OTR_WRITE; + if (mode & 0007) + prot |= FIBF_OTR_DELETE; AFFS_I(inode)->i_protect = prot; } diff --git a/fs/affs/file.c b/fs/affs/file.c index a26a0f96c119..d91b0133d95d 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -429,6 +429,24 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int affs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* Clear Archived bit on file writes, as AmigaOS would do */ + if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { + AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED; + mark_inode_dirty(inode); + } + + return ret; +} + static sector_t _affs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,affs_get_block); @@ -438,7 +456,7 @@ const struct address_space_operations affs_aops = { .readpage = affs_readpage, .writepage = affs_writepage, .write_begin = affs_write_begin, - .write_end = generic_write_end, + .write_end = affs_write_end, .direct_IO = affs_direct_IO, .bmap = _affs_bmap }; @@ -795,6 +813,12 @@ done: if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; + /* Clear Archived bit on file writes, as AmigaOS would do */ + if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { + AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED; + mark_inode_dirty(inode); + } + err_first_bh: unlock_page(page); put_page(page); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a346cf7659f1..044412110b52 100644 --- a/fs/affs/inode.c +++ 
b/fs/affs/inode.c @@ -93,7 +93,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) case ST_ROOT: inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; - /* fall through */ + fallthrough; case ST_USERDIR: if (be32_to_cpu(tail->stype) == ST_USERDIR || affs_test_opt(sbi->s_flags, SF_SETMODE)) { diff --git a/fs/affs/super.c b/fs/affs/super.c index 47107c6712a6..a100cd9950c8 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -474,7 +474,7 @@ got_root: case MUFS_INTLFFS: case MUFS_DCFFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall thru */ + fallthrough; case FS_INTLFFS: case FS_DCFFS: affs_set_opt(sbi->s_flags, SF_INTL); @@ -486,7 +486,7 @@ got_root: break; case MUFS_OFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall through */ + fallthrough; case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); sb->s_flags |= SB_NOEXEC; @@ -494,7 +494,7 @@ got_root: case MUFS_DCOFS: case MUFS_INTLOFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall through */ + fallthrough; case FS_DCOFS: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index bef413818af7..a4e9e6e07e93 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -252,7 +252,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the FID array and its count in two steps */ - /* fall through */ + fallthrough; case 1: _debug("extract FID count"); ret = afs_extract_data(call, true); @@ -271,7 +271,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) afs_extract_to_buf(call, call->count * 3 * 4); call->unmarshall++; - /* Fall through */ + fallthrough; case 2: _debug("extract FID array"); ret = afs_extract_data(call, true); @@ -297,7 +297,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the callback array and its count in two steps */ - /* fall through */ + fallthrough; case 3: _debug("extract CB count"); ret = afs_extract_data(call, true); @@ -312,7 +312,7 
@@ static int afs_deliver_cb_callback(struct afs_call *call) iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); call->unmarshall++; - /* Fall through */ + fallthrough; case 4: _debug("extract discard %zu/%u", iov_iter_count(call->iter), call->count2 * 3 * 4); @@ -391,7 +391,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; case 1: _debug("extract UUID"); ret = afs_extract_data(call, false); @@ -503,7 +503,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; case 1: _debug("extract UUID"); ret = afs_extract_data(call, false); @@ -618,7 +618,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the FID array and its count in two steps */ - /* Fall through */ + fallthrough; case 1: _debug("extract FID count"); ret = afs_extract_data(call, true); @@ -637,7 +637,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; case 2: _debug("extract FID array"); ret = afs_extract_data(call, false); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index b79879aacc02..7b784af604fd 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -382,15 +382,17 @@ void afs_dynroot_depopulate(struct super_block *sb) net->dynroot_sb = NULL; mutex_unlock(&net->proc_cells_lock); - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); + if (root) { + inode_lock(root->d_inode); + + /* Remove all the pins for dirs created for manually added cells */ + list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + if 
(subdir->d_fsdata) { + subdir->d_fsdata = NULL; + dput(subdir); + } } - } - inode_unlock(root->d_inode); + inode_unlock(root->d_inode); + } } diff --git a/fs/afs/file.c b/fs/afs/file.c index 6f6ed1605cfe..371d1488cc54 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -311,7 +311,7 @@ int afs_page_filler(void *data, struct page *page) case -ENOBUFS: _debug("cache said ENOBUFS"); - /* fall through */ + fallthrough; default: go_on: req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index ffb8575345ca..cb3054c7843e 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -376,7 +376,6 @@ again: spin_unlock(&vnode->lock); return; - /* Fall through */ default: /* Looks like a lock request was withdrawn. */ spin_unlock(&vnode->lock); diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 24fd163c6323..97cab12b0a6c 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -235,6 +235,7 @@ int afs_put_operation(struct afs_operation *op) afs_end_cursor(&op->ac); afs_put_serverlist(op->net, op->server_list); afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); + key_put(op->key); kfree(op); return ret; } diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 5d9ef517cf81..e7e98ad63a91 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -161,8 +161,8 @@ responded: } } - rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); - if (rtt_us < server->probe.rtt) { + if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && + rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index acb4d0ca2649..1d95ed9dd86e 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -320,7 +320,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->tmp_u = htonl(0); afs_extract_to_tmp(call); } - /* Fall through */ + fallthrough; /* extract the returned data length */ case 1: 
@@ -348,7 +348,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->bvec[0].bv_page = req->pages[req->index]; iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); - /* Fall through */ + fallthrough; /* extract the returned data */ case 2: @@ -375,7 +375,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* Discard any excess data the server gave us */ afs_extract_discard(call, req->actual_len - req->len); call->unmarshall = 3; - /* Fall through */ + fallthrough; case 3: _debug("extract discard %zu/%llu", @@ -388,7 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) no_more_data: call->unmarshall = 4; afs_extract_to_buf(call, (21 + 3 + 6) * 4); - /* Fall through */ + fallthrough; /* extract the metadata */ case 4: @@ -1343,7 +1343,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) case 0: call->unmarshall++; afs_extract_to_buf(call, 12 * 4); - /* Fall through */ + fallthrough; /* extract the returned status record */ case 1: @@ -1356,7 +1356,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* extract the volume name length */ case 2: @@ -1371,7 +1371,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the volume name */ case 3: @@ -1385,7 +1385,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("volname '%s'", p); afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message length */ case 4: @@ -1400,7 +1400,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, 
size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message */ case 5: @@ -1415,7 +1415,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day length */ case 6: @@ -1430,7 +1430,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day */ case 7: @@ -1682,7 +1682,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the capabilities word count */ case 1: @@ -1696,7 +1696,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) call->count2 = count; afs_extract_discard(call, count * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract capabilities words */ case 2: @@ -1776,7 +1776,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file status count and array in two steps */ case 1: @@ -1794,7 +1794,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_counts: afs_extract_to_buf(call, 21 * sizeof(__be32)); - /* Fall through */ + fallthrough; case 2: _debug("extract status array %u", call->count); @@ -1824,7 +1824,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->count = 0; call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* Extract the callback count and array in two steps */ case 3: @@ -1841,7 +1841,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_cbs: afs_extract_to_buf(call, 3 * 
sizeof(__be32)); - /* Fall through */ + fallthrough; case 4: _debug("extract CB array"); @@ -1870,7 +1870,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_buf(call, 6 * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; case 5: ret = afs_extract_data(call, false); @@ -1974,7 +1974,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the returned data length */ case 1: @@ -1992,7 +1992,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) acl->size = call->count2; afs_extract_begin(call, acl->data, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the returned data */ case 2: @@ -2002,7 +2002,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) afs_extract_to_buf(call, (21 + 6) * 4); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the metadata */ case 3: diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1d13d2e882ad..0fe8844b4bee 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { - struct inode *inode = &op->file[0].vnode->vfs_inode; + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; + loff_t old_i_size = i_size_read(inode); + + op->setattr.old_i_size = old_i_size; + afs_vnode_commit_status(op, vp); + /* inode->i_size has now been changed. 
*/ + + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t size = op->setattr.attr->ia_size; + if (size > old_i_size) + pagecache_isize_extended(inode, old_i_size, size); + } +} + +static void afs_setattr_edit_file(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; - afs_vnode_commit_status(op, &op->file[0]); if (op->setattr.attr->ia_valid & ATTR_SIZE) { - loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size; - if (size > i_size) - pagecache_isize_extended(inode, i_size, size); - truncate_pagecache(inode, size); + loff_t size = op->setattr.attr->ia_size; + loff_t i_size = op->setattr.old_i_size; + + if (size < i_size) + truncate_pagecache(inode, size); } } @@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = { .issue_afs_rpc = afs_fs_setattr, .issue_yfs_rpc = yfs_fs_setattr, .success = afs_setattr_success, + .edit_dir = afs_setattr_edit_file, }; /* @@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); + /* Prevent any new writebacks from starting whilst we do this. */ + down_write(&vnode->validate_lock); + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? 
afs_file_key(attr->ia_file) : NULL), vnode->volume); - if (IS_ERR(op)) - return PTR_ERR(op); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto out_unlock; + } afs_op_set_vnode(op, 0, vnode); op->setattr.attr = attr; @@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) op->file[0].update_ctime = 1; op->ops = &afs_setattr_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + +out_unlock: + up_write(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 792ac711985e..e5f0446f27e5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -401,22 +401,24 @@ struct afs_vlserver { #define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ #define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ #define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ +#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ rwlock_t lock; /* Lock on addresses */ atomic_t usage; + unsigned int rtt; /* Server's current RTT in uS */ /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT as ktime/64 */ + unsigned int rtt; /* RTT in uS */ u32 abort_code; short error; - bool have_result; - bool responded:1; - bool is_yfs:1; - bool not_yfs:1; - bool local_failure:1; + unsigned short flags; +#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ +#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ +#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ +#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; u16 port; @@ -810,6 +812,7 @@ struct afs_operation { } store; struct { struct iattr *attr; + loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl *yacl; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 5334f1bd2bca..1d1a8debe472 100644 --- 
a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -120,42 +120,42 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) if (e->error == -ETIMEDOUT || e->error == -ETIME) return; - /* Fall through */ + fallthrough; case -ETIMEDOUT: case -ETIME: if (e->error == -ENOMEM || e->error == -ENONET) return; - /* Fall through */ + fallthrough; case -ENOMEM: case -ENONET: if (e->error == -ERFKILL) return; - /* Fall through */ + fallthrough; case -ERFKILL: if (e->error == -EADDRNOTAVAIL) return; - /* Fall through */ + fallthrough; case -EADDRNOTAVAIL: if (e->error == -ENETUNREACH) return; - /* Fall through */ + fallthrough; case -ENETUNREACH: if (e->error == -EHOSTUNREACH) return; - /* Fall through */ + fallthrough; case -EHOSTUNREACH: if (e->error == -EHOSTDOWN) return; - /* Fall through */ + fallthrough; case -EHOSTDOWN: if (e->error == -ECONNREFUSED) return; - /* Fall through */ + fallthrough; case -ECONNREFUSED: if (e->error == -ECONNRESET) return; - /* Fall through */ + fallthrough; case -ECONNRESET: /* Responded, but call expired. */ if (e->responded) return; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index e817fc740ba0..e8babb62ed44 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -310,6 +310,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) alist->preferred == i ? 
'>' : '-', &alist->addrs[i].transport); } + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); + seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", + vlserver->probe.flags, vlserver->probe.error, + vlserver->probe.abort_code, + atomic_read(&vlserver->probe_outstanding)); return 0; } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 6a0935cb822f..d83f13c44b92 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_operation *op) case -ETIME: if (op->error != -EDESTADDRREQ) goto iterate_address; - /* Fall through */ + fallthrough; case -ERFKILL: case -EADDRNOTAVAIL: case -ENETUNREACH: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8fc8fb406a5a..8be709cb8542 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -568,7 +568,7 @@ static void afs_deliver_to_call(struct afs_call *call) case -EIO: pr_err("kAFS: Call %u in bad state %u\n", call->debug_id, state); - /* Fall through */ + fallthrough; case -ENODATA: case -EBADMSG: case -EMSGSIZE: @@ -669,7 +669,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call, ret = call->ret0; call->ret0 = 0; - /* Fall through */ + fallthrough; case -ECONNABORTED: ac->responded = true; break; @@ -872,7 +872,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); - /* Fall through */ + fallthrough; default: _leave(" [error]"); return; diff --git a/fs/afs/super.c b/fs/afs/super.c index b552357b1d13..3a40ee752c1e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -456,7 +456,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 8fea54eba0c2..38b2ba1d9ec0 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -21,6 +21,7 @@ 
struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, rwlock_init(&vlserver->lock); init_waitqueue_head(&vlserver->probe_wq); spin_lock_init(&vlserver->probe_lock); + vlserver->rtt = UINT_MAX; vlserver->name_len = name_len; vlserver->port = port; memcpy(vlserver->name, name, name_len); diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index e3aa013c2177..d1c7068b4346 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -11,15 +11,33 @@ #include "internal.h" #include "protocol_yfs.h" -static bool afs_vl_probe_done(struct afs_vlserver *server) + +/* + * Handle the completion of a set of probes. + */ +static void afs_finished_vl_probe(struct afs_vlserver *server) { - if (!atomic_dec_and_test(&server->probe_outstanding)) - return false; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { + server->rtt = UINT_MAX; + clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + } - wake_up_var(&server->probe_outstanding); clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING); - return true; +} + +/* + * Handle the completion of a probe RPC call. 
+ */ +static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) +{ + if (atomic_dec_and_test(&server->probe_outstanding)) { + afs_finished_vl_probe(server); + wake_up = true; + } + + if (wake_up) + wake_up_all(&server->probe_wq); } /* @@ -45,15 +63,20 @@ void afs_vlserver_probe_result(struct afs_call *call) server->probe.error = 0; goto responded; case -ECONNABORTED: - if (!server->probe.responded) { + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { server->probe.abort_code = call->abort_code; server->probe.error = ret; } goto responded; case -ENOMEM: case -ENONET: - server->probe.local_failure = true; - afs_io_error(call, afs_io_error_vl_probe_fail); + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EKEYREJECTED: + server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE; + if (server->probe.error == 0) + server->probe.error = ret; + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. 
*/ case -ERFKILL: @@ -67,12 +90,12 @@ void afs_vlserver_probe_result(struct afs_call *call) default: clear_bit(index, &alist->responded); set_bit(index, &alist->failed); - if (!server->probe.responded && + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && (server->probe.error == 0 || server->probe.error == -ETIMEDOUT || server->probe.error == -ETIME)) server->probe.error = ret; - afs_io_error(call, afs_io_error_vl_probe_fail); + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); goto out; } @@ -81,39 +104,36 @@ responded: clear_bit(index, &alist->failed); if (call->service_id == YFS_VL_SERVICE) { - server->probe.is_yfs = true; + server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } else { - server->probe.not_yfs = true; - if (!server->probe.is_yfs) { + server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } } - rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); - if (rtt_us < server->probe.rtt) { + if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && + rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; + server->rtt = rtt_us; alist->preferred = index; - have_result = true; } smp_wmb(); /* Set rtt before responded. 
*/ - server->probe.responded = true; + server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED; set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + have_result = true; out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", server_index, index, &alist->addrs[index].transport, rtt_us, ret); - have_result |= afs_vl_probe_done(server); - if (have_result) { - server->probe.have_result = true; - wake_up_var(&server->probe.have_result); - wake_up_all(&server->probe_wq); - } + afs_done_one_vl_probe(server, have_result); } /* @@ -151,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net, in_progress = true; } else { afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); + afs_done_one_vl_probe(server, false); } } - if (!in_progress) - afs_vl_probe_done(server); return in_progress; } @@ -193,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, { struct wait_queue_entry *waits; struct afs_vlserver *server; - unsigned int rtt = UINT_MAX; + unsigned int rtt = UINT_MAX, rtt_s; bool have_responders = false; int pref = -1, i; @@ -205,7 +224,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, server = vllist->servers[i].server; if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) __clear_bit(i, &untried); - if (server->probe.responded) + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) have_responders = true; } } @@ -231,7 +250,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if (server->probe.responded) + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) goto stop; if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) still_probing = true; @@ -249,10 +268,11 @@ stop: for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if (server->probe.responded && - 
server->probe.rtt < rtt) { + rtt_s = READ_ONCE(server->rtt); + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) && + rtt_s < rtt) { pref = i; - rtt = server->probe.rtt; + rtt = rtt_s; } remove_wait_queue(&server->probe_wq, &waits[i]); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index f405ca8b240a..c0458c903b31 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -192,7 +192,8 @@ pick_server: for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - if (!test_bit(i, &vc->untried) || !s->probe.responded) + if (!test_bit(i, &vc->untried) || + !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) continue; if (s->probe.rtt < rtt) { vc->index = i; @@ -262,10 +263,14 @@ no_more_servers: for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) + e.responded = true; afs_prioritise_error(&e, READ_ONCE(s->probe.error), s->probe.abort_code); } + error = e.error; + failed_set_error: vc->error = error; failed: diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index fd82850cd424..dc9327332f06 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -196,7 +196,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) /* Extract the returned uuid, uniquifier, nentries and * blkaddrs size */ - /* Fall through */ + fallthrough; case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -221,7 +221,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) count = min(call->count, 4U); afs_extract_to_buf(call, count * sizeof(__be32)); - /* Fall through - and extract entries */ + fallthrough; /* and extract entries */ case 2: ret = afs_extract_data(call, call->count > 4); if (ret < 0) @@ -324,7 +324,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through - and extract the capabilities word count */ + 
fallthrough; /* and extract the capabilities word count */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -337,7 +337,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) call->unmarshall++; afs_extract_discard(call, count * sizeof(__be32)); - /* Fall through - and extract capabilities words */ + fallthrough; /* and extract capabilities words */ case 2: ret = afs_extract_data(call, false); if (ret < 0) @@ -436,7 +436,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) /* Extract the returned uuid, uniquifier, fsEndpoints count and * either the first fsEndpoint type or the volEndpoints * count if there are no fsEndpoints. */ - /* Fall through */ + fallthrough; case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -475,7 +475,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall = 2; - /* Fall through - and extract fsEndpoints[] entries */ + fallthrough; /* and extract fsEndpoints[] entries */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -526,7 +526,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) * extract the type of the next endpoint when we extract the * data of the current one, but this is the first... 
*/ - /* Fall through */ + fallthrough; case 3: ret = afs_extract_data(call, true); if (ret < 0) @@ -552,7 +552,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall = 4; - /* Fall through - and extract volEndpoints[] entries */ + fallthrough; /* and extract volEndpoints[] entries */ case 4: ret = afs_extract_data(call, true); if (ret < 0) @@ -587,7 +587,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_discard(call, 0); call->unmarshall = 5; - /* Fall through - Done */ + fallthrough; /* Done */ case 5: ret = afs_extract_data(call, false); if (ret < 0) @@ -663,7 +663,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through - and extract the cell name length */ + fallthrough; /* and extract the cell name length */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -685,7 +685,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) afs_extract_begin(call, cell_name, namesz); call->unmarshall++; - /* Fall through - and extract cell name */ + fallthrough; /* and extract cell name */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -694,7 +694,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) afs_extract_discard(call, call->count2); call->unmarshall++; - /* Fall through - and extract padding */ + fallthrough; /* and extract padding */ case 3: ret = afs_extract_data(call, false); if (ret < 0) diff --git a/fs/afs/write.c b/fs/afs/write.c index a121c247d95a..da12abd6db21 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -609,7 +609,7 @@ no_more: default: pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - /* Fall through */ + fallthrough; case -EACCES: case -EPERM: case -ENOKEY: @@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct 
writeback_control *wbc) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); pgoff_t start, end, next; int ret; _enter(""); + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + if (wbc->sync_mode == WB_SYNC_ALL) + down_read(&vnode->validate_lock); + else if (!down_read_trylock(&vnode->validate_lock)) + return 0; + if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; @@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping, ret = afs_writepages_region(mapping, wbc, start, end, &next); } + up_read(&vnode->validate_lock); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 8c24fdc899e3..3b1239b7e90d 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -373,7 +373,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) req->offset = req->pos & (PAGE_SIZE - 1); afs_extract_to_tmp64(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the returned data length */ case 1: @@ -401,7 +401,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) call->bvec[0].bv_page = req->pages[req->index]; iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); - /* Fall through */ + fallthrough; /* extract the returned data */ case 2: @@ -428,7 +428,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) /* Discard any excess data the server gave us */ afs_extract_discard(call, req->actual_len - req->len); call->unmarshall = 3; - /* Fall through */ + fallthrough; case 3: _debug("extract discard %zu/%llu", @@ -444,7 +444,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - /* Fall through */ + fallthrough; /* extract the metadata */ case 4: @@ -461,7 +461,7 @@ static int 
yfs_deliver_fs_fetch_data64(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; - /* Fall through */ + fallthrough; case 5: break; @@ -1262,7 +1262,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) case 0: call->unmarshall++; afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus)); - /* Fall through */ + fallthrough; /* extract the returned status record */ case 1: @@ -1275,7 +1275,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* extract the volume name length */ case 2: @@ -1290,7 +1290,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the volume name */ case 3: @@ -1304,7 +1304,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) _debug("volname '%s'", p); afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message length */ case 4: @@ -1319,7 +1319,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message */ case 5: @@ -1334,7 +1334,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day length */ case 6: @@ -1349,7 +1349,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day */ case 7: @@ -1363,7 +1363,7 
@@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; - /* Fall through */ + fallthrough; case 8: break; @@ -1622,7 +1622,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file status count and array in two steps */ case 1: @@ -1640,7 +1640,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_counts: afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus)); - /* Fall through */ + fallthrough; case 2: _debug("extract status array %u", call->count); @@ -1670,7 +1670,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->count = 0; call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* Extract the callback count and array in two steps */ case 3: @@ -1687,7 +1687,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_cbs: afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack)); - /* Fall through */ + fallthrough; case 4: _debug("extract CB array"); @@ -1716,7 +1716,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); call->unmarshall++; - /* Fall through */ + fallthrough; case 5: ret = afs_extract_data(call, false); @@ -1727,7 +1727,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_YFSVolSync(&bp, &op->volsync); call->unmarshall++; - /* Fall through */ + fallthrough; case 6: break; @@ -1804,7 +1804,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file ACL length */ case 1: @@ -1826,7 +1826,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) afs_extract_discard(call, size); } 
call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file ACL */ case 2: @@ -1836,7 +1836,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the volume ACL length */ case 3: @@ -1858,7 +1858,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) afs_extract_discard(call, size); } call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the volume ACL */ case 4: @@ -1871,7 +1871,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the metadata */ case 5: @@ -1886,7 +1886,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) xdr_decode_YFSVolSync(&bp, &op->volsync); call->unmarshall++; - /* Fall through */ + fallthrough; case 6: break; @@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, *iovec = NULL; return ret; } -#ifdef CONFIG_COMPAT - if (compat) - return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec, - iter); -#endif - return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); + + return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) @@ -1511,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) * may be already running. Just fail this IO with EINTR. 
*/ ret = -EINTR; - /*FALLTHRU*/ + fallthrough; default: req->ki_complete(req, ret, 0); } diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 74c886f7c51c..5ced859dac53 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi, mutex_lock(&sbi->pipe_mutex); while (bytes) { - wr = kernel_write(file, data, bytes, &file->f_pos); + wr = __kernel_write(file, data, bytes, NULL); if (wr <= 0) break; data += wr; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f2f9086ebe98..b9c658e0548e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra; + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm, vm_munmap(textpos, text_len); goto err; } - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", data_len + bss_len + stack_len, datapos); @@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra; + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm, } realdatastart = textpos + ntohl(hdr->data_start); - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(u32), + FLAT_DATA_ALIGN); reloc = (__be32 __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); @@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm, (text_len + 
full_data - sizeof(struct flat_hdr)), 0); - if (datapos != realdatastart) - memmove((void *)datapos, (void *)realdatastart, - full_data); + memmove((void *) datapos, (void *) realdatastart, + full_data); #else /* * This is used on MMU systems mainly for testing. @@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm, if (IS_ERR_VALUE(result)) { ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); - vm_munmap(textpos, text_len + data_len + extra); + vm_munmap(textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(u32)); goto err; } } diff --git a/fs/block_dev.c b/fs/block_dev.c index 8ae833e00443..9e84b1928b94 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +/* + * Drop all buffers & page cache for given bdev range. This function bails + * with error if bdev has other exclusive owner (such as filesystem). + */ +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + struct block_device *claimed_bdev = NULL; + int err; + + /* + * If we don't hold exclusive handle for the device, upgrade to it + * while we discard the buffer cache to avoid discarding buffers + * under live filesystem. 
+ */ + if (!(mode & FMODE_EXCL)) { + claimed_bdev = bdev->bd_contains; + err = bd_prepare_to_claim(bdev, claimed_bdev, + truncate_bdev_range); + if (err) + return err; + } + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); + return 0; +} +EXPORT_SYMBOL(truncate_bdev_range); + static void set_init_blocksize(struct block_device *bdev) { bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); @@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data) return 0; } -struct block_device *bdget(dev_t dev) +static struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; @@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev) bdev = &BDEV_I(inode)->bdev; if (inode->i_state & I_NEW) { + spin_lock_init(&bdev->bd_size_lock); bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; - bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; inode->i_rdev = dev; inode->i_bdev = bdev; @@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev) return bdev; } -EXPORT_SYMBOL(bdget); - /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. 
@@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); +struct block_device *bdget_part(struct hd_struct *part) +{ + return bdget(part_devt(part)); +} + long nr_blockdev_pages(void) { struct inode *inode; @@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk, { loff_t disk_size, bdev_size; + spin_lock(&bdev->bd_size_lock); disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { @@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk, disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); - if (bdev_size > disk_size && __invalidate_device(bdev, false)) + } + spin_unlock(&bdev->bd_size_lock); + + if (bdev_size > disk_size) { + if (__invalidate_device(bdev, false)) pr_warn("VFS: busy inodes on resized disk %s\n", disk->disk_name); } - bdev->bd_invalidated = 0; } /** - * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back - * @disk: struct gendisk to be revalidated + * revalidate_disk_size - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @verbose: if %true log a message about a size change if there is any * - * This routine is a wrapper for lower-level driver's revalidate_disk - * call-backs. It is used to do common pre and post operations needed - * for all revalidate_disk operations. + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. When shrinking the bdev size, its all caches + * are freed. */ -int revalidate_disk(struct gendisk *disk) +void revalidate_disk_size(struct gendisk *disk, bool verbose) { - int ret = 0; - - if (disk->fops->revalidate_disk) - ret = disk->fops->revalidate_disk(disk); + struct block_device *bdev; /* * Hidden disks don't have associated bdev so there's no point in - * revalidating it. + * revalidating them. 
*/ - if (!(disk->flags & GENHD_FL_HIDDEN)) { - struct block_device *bdev = bdget_disk(disk, 0); - - if (!bdev) - return ret; + if (disk->flags & GENHD_FL_HIDDEN) + return; - mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev, ret == 0); - mutex_unlock(&bdev->bd_mutex); + bdev = bdget_disk(disk, 0); + if (bdev) { + check_disk_size_change(disk, bdev, verbose); bdput(bdev); } - return ret; } -EXPORT_SYMBOL(revalidate_disk); +EXPORT_SYMBOL(revalidate_disk_size); -/* - * This routine checks whether a removable media has been changed, - * and invalidates all buffer-cache-entries in that case. This - * is a relatively slow routine, so we have to try to minimize using - * it. Thus it is called only upon a 'mount' or 'open'. This - * is the best way of combining speed and utility, I think. - * People changing diskettes in the middle of an operation deserve - * to lose :-) - */ -int check_disk_change(struct block_device *bdev) +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { - struct gendisk *disk = bdev->bd_disk; - const struct block_device_operations *bdops = disk->fops; - unsigned int events; - - events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | - DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return 0; - - if (__invalidate_device(bdev, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - disk->disk_name); - bdev->bd_invalidated = 1; - if (bdops->revalidate_disk) - bdops->revalidate_disk(bdev->bd_disk); - return 1; -} - -EXPORT_SYMBOL(check_disk_change); - -void bd_set_size(struct block_device *bdev, loff_t size) -{ - inode_lock(bdev->bd_inode); - i_size_write(bdev->bd_inode, size); - inode_unlock(bdev->bd_inode); + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); } -EXPORT_SYMBOL(bd_set_size); +EXPORT_SYMBOL(bd_set_nr_sectors); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); @@ 
-1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + rescan: ret = blk_drop_partitions(bdev); if (ret) @@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, struct gendisk *disk; int ret; int partno; - int perm = 0; bool first_open = false, unblock_events = true, need_restart; - if (mode & FMODE_READ) - perm |= MAY_READ; - if (mode & FMODE_WRITE) - perm |= MAY_WRITE; - /* - * hooks: /n/, see "layering violations". - */ - if (!for_part) { - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) - return ret; - } - restart: need_restart = false; ret = -ENXIO; @@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, } if (!ret) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bd_set_nr_sectors(bdev, get_capacity(disk)); set_init_blocksize(bdev); } @@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, * The latter is necessary to prevent ghost * partitions on a removed medium. 
*/ - if (bdev->bd_invalidated && + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); @@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, ret = -ENXIO; goto out_clear; } - bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } @@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) @@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, * RETURNS: * 0 on success, -errno on failure. */ -int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - int res; + int ret, perm = 0; - res =__blkdev_get(bdev, mode, holder, 0); - if (res) - bdput(bdev); - return res; + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret) + goto bdput; + + ret =__blkdev_get(bdev, mode, holder, 0); + if (ret) + goto bdput; + return 0; + +bdput: + bdput(bdev); + return ret; } -EXPORT_SYMBOL(blkdev_get); /** * blkdev_get_by_path - open a block device by name @@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) @@ -1969,7 +1967,6 @@ static long 
blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; int error; @@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EINVAL; /* Invalidate the page cache, including dirty pages. */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + return error; switch (mode) { case FALLOC_FL_ZERO_RANGE: @@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, * the caller will be given -EBUSY. The third argument is * inclusive, so the rounding here is safe. */ - return invalidate_inode_pages2_range(mapping, + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea1c28ccb44f..b3268f4ea5f3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; - struct rb_node *rb_node; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 613920c17ac1..c0f1d6818df7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = 
cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); - - if (first) - btrfs_sysfs_add_block_group_type(cache); } static struct btrfs_block_group *btrfs_create_block_group_cache( @@ -1798,7 +1792,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); - set_free_space_tree_thresholds(cache); cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; @@ -1874,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static int read_block_group_item(struct btrfs_block_group *cache, +static void read_block_group_item(struct btrfs_block_group *cache, struct btrfs_path *path, const struct btrfs_key *key) { @@ -1888,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache, sizeof(bgi)); cache->used = btrfs_stack_block_group_used(&bgi); cache->flags = btrfs_stack_block_group_flags(&bgi); - - return 0; } static int read_one_block_group(struct btrfs_fs_info *info, @@ -1908,9 +1899,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - ret = read_block_group_item(cache, path, key); - if (ret < 0) - goto error; + read_block_group_item(cache, path, key); + + set_free_space_tree_thresholds(cache); if (need_clear) { /* @@ -2034,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } - rcu_read_lock(); - list_for_each_entry_rcu(space_info, &info->space_info, list) { + list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + 
struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2055,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } - rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -2096,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) return; while (!list_empty(&trans->new_bgs)) { + int index; + block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, bg_list); if (ret) goto next; + index = btrfs_bg_flags_to_raid_index(block_group->flags); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); @@ -2110,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + /* Already aborted the transaction if it failed. */ next: btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -2132,6 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, return -ENOMEM; cache->length = size; + set_free_space_tree_thresholds(cache); cache->used = bytes_used; cache->flags = type; cache->last_byte_to_unpin = (u64)-1; @@ -2783,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. 
This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { @@ -2959,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake + * tickets if that happens + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -2992,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } @@ -3000,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } - rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, @@ -3336,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->block_group_cache_lock); - /* - * Now that all the block groups are freed, go through and free all the - * space_info structs. This is only called during the final stages of - * unmount, and so we know nobody is using them. We call - * synchronize_rcu() once before we start, just to be on the safe side. 
- */ - synchronize_rcu(); - btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c47b6c6fea9f..92dd86bceae3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -21,14 +21,18 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). + */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -212,6 +216,11 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) +{ + return inode->root->fs_info->sectorsize; +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -324,23 +333,6 @@ struct btrfs_dio_private { u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. 
- */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1ab56a734e70..eeface30facd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,41 +29,6 @@ #include "extent_io.h" #include "extent_map.h" -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); -void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); - -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); -void lzo_free_workspace(struct list_head *ws); - -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long 
*total_in, unsigned long *total_out); -int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); -void zstd_put_workspace(struct list_head *ws); - static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9f3dbe372631..8001b700ea3a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long 
start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 70e49d8d4f6c..113da62dc17f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type) btrfs_csums[csum_type].name; } -size_t __const btrfs_get_num_csums(void) +size_t __attribute_const__ btrfs_get_num_csums(void) { return ARRAY_SIZE(btrfs_csums); } @@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0); + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( const struct btrfs_disk_key *disk_key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *ret; @@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( ret = btrfs_alloc_tree_block(trans, root, parent_start, 
root->root_key.objectid, disk_key, level, - hint, empty_size); + hint, empty_size, nest); trans->can_flush_pending_bgs = true; return ret; @@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1297,6 +1306,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); btrfs_tree_read_lock(eb_rewin); 
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1370,7 +1381,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); @@ -1378,6 +1388,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); if (tm) __tree_mod_log_rewind(fs_info, eb, time_seq, tm); else @@ -1442,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; @@ -1481,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -1653,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -1851,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking_write(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, 
child, mid, 0, &child, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); @@ -1887,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = NULL; if (left) { - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto enospc; @@ -1902,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; if (right) { - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto enospc; @@ -2065,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (left) { u32 left_nr; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); @@ -2073,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { @@ -2119,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (right) { u32 right_nr; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); @@ -2128,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { @@ -2597,7 +2616,7 @@ static struct extent_buffer 
*btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + b = __btrfs_read_lock_root_node(root, p->recurse); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2736,11 +2755,13 @@ again: btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, - &b); + &b, + BTRFS_NESTING_COW); else err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], &b, + BTRFS_NESTING_COW); if (err) { ret = err; goto done; @@ -2871,7 +2892,8 @@ cow_done: } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -3160,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, } /* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. 
+ */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + +/* * try to push data from one node into the next node left in the * tree. 
* @@ -3203,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3271,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), @@ -3327,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, - root->node->start, 0); + root->node->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); @@ -3457,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0); + c->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); @@ -3726,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return 1; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); @@ -3735,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) 
goto out_unlock; @@ -3747,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (path->slots[0] == left_nritems && !empty) { /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete @@ -3959,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return 1; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); @@ -3970,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -3984,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; + } return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); @@ -4232,8 +4330,18 @@ again: else btrfs_item_key(l, &disk_key, mid); + /* + * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0); + l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); @@ -4478,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, new_key, &item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4653,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot +/** + * setup_items_for_insert - Helper called before inserting one or more items + * to a leaf. Main purpose is to save stack depth by doing the bulk of the work + * in a function that doesn't call btrfs_search_slot + * + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @cpu_key: array of keys for items to be inserted + * @data_size: size of the body of each item we are going to insert + * @nr: size of @cpu_key/@data_size arrays */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) + int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4671,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; + u32 total_size; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4697,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct 
btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d old_data %d data_end %d", + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", slot, old_data, data_end); BUG(); } @@ -4730,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; + btrfs_set_token_item_offset(&token, item, data_end); btrfs_set_token_item_size(&token, item, data_size[i]); } @@ -4773,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0; } @@ -5111,7 +5229,7 @@ again: slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. 
*/ while (slot < nritems) { u64 gen; @@ -5375,7 +5493,9 @@ again: } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5410,7 +5530,9 @@ again: ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9c7e466f27a9..aac3d6f4e35b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,6 +374,7 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -void btrfs_init_async_reclaim_work(struct work_struct *work); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; @@ -541,11 +542,6 @@ enum { /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, - /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. * Set and cleared while holding fs_info::balance_mutex. 
@@ -565,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -912,6 +921,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; @@ -935,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1181,24 +1194,40 @@ struct btrfs_root { #endif }; -struct btrfs_clone_extent_info { +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; - u32 item_size; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. 
+ */ + int insertions; }; struct btrfs_file_private { void *filldir_buf; }; -static inline u32 btrfs_inode_sectorsize(const struct inode *inode) -{ - return btrfs_sb(inode->i_sb)->sectorsize; -} static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { @@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, #define cpu_to_le8(v) (v) #define __le8 u8 +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ @@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ + return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ - return le##bits##_to_cpu(s->member); \ + return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ - s->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &s->member); \ } - static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { @@ -2262,7 +2299,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); -size_t __const 
btrfs_get_num_csums(void); +size_t __attribute_const__ btrfs_get_num_csums(void); /* @@ -2518,13 +2555,14 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr); + u64 objectid, u64 offset, u64 bytenr, bool strict); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size); + u64 empty_size, + enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum { * * Can be interruped by fatal signal. */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* @@ -2619,7 +2659,7 @@ enum btrfs_flush_state { int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); @@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int btrfs_cow_block(struct 
btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); + int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2930,11 +2969,15 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes); + u64 *ram_bytes, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); @@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state 
**cached_state); @@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); @@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); - static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c 
index 0e354e9e57d0..bacee09b7bfd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* Make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * If we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. 
- */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. - */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* Commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. 
- */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - spin_unlock(&data_sinfo->lock); + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return 0; + return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - spin_unlock(&data_sinfo->lock); + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf1595a42a98..5aba81e16113 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - fs_info->nodesize, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, @@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, - total_data_size, total_size, nitems); + setup_items_for_insert(root, path, keys, data_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db93909b25e0..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ 
b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } @@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) wake_up(&fs_info->dev_replace.replace_wait); } +/* + * When finishing the device replace, before swapping the source device with the + * target device we must update the chunk allocation state in the target device, + * as it is empty because replace works by directly copying the chunks and not + * through the normal chunk allocation path. 
+ */ +static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 found_start; + u64 found_end; + int ret = 0; + + lockdep_assert_held(&srcdev->fs_info->chunk_mutex); + + while (!find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = set_extent_bits(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED); + if (ret) + break; + start = found_end + 1; + } + + free_extent_state(cached_state); + return ret; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; - /* replace old device with new one in mapping tree */ + /* + * Update allocation 
state in the new device and replace the old device + * with the new one in the mapping tree. + */ if (!scrub_ret) { + scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); + if (scrub_ret) + goto error; btrfs_dev_replace_update_device_in_mapping_tree(fs_info, src_device, tgt_device); @@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); +error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); - btrfs_rm_dev_replace_free_srcdev(src_device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) + btrfs_scratch_superblocks(fs_info, src_device->bdev, + src_device->name->str); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; -} - -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; + btrfs_rm_dev_replace_free_srcdev(src_device); - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - 
free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); + return 0; } /* @@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9ae25f632157..8e3438672a82 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, #endif /* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 
0; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret == -EEXIST) { - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) - em = ERR_PTR(-EIO); - } else if (ret) { - free_extent_map(em); - em = ERR_PTR(ret); - } - write_unlock(&em_tree->lock); - -out: - return em; -} - -/* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - int ret = 1; + u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); - while (fs_devices) { - u8 *metadata_uuid; + /* + * Checking the incompat flag is only valid for the current fs. For + * seed devices it's forbidden to have their uuid changed so reading + * ->fsid in this case is fine + */ + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; - /* - * Checking the incompat flag is only valid for the current - * fs. 
For seed devices it's forbidden to have their uuid - * changed so reading ->fsid in this case is fine - */ - if (fs_devices == fs_info->fs_devices && - btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + return 0; - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return 0; + + return 1; } -static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) { u64 found_start; int found_level; @@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { - u32 val; - u32 found = 0; - - memcpy(&found, result, csum_size); + u8 val[BTRFS_CSUM_SIZE] = { 0 }; read_extent_buffer(eb, &val, 0, csum_size); btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted %x found %x level %d", + "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", fs_info->sb->s_id, eb->start, - val, found, btrfs_header_level(eb)); + CSUM_FMT_VALUE(csum_size, val), + CSUM_FMT_VALUE(csum_size, result), + btrfs_header_level(eb)); ret = -EUCLEAN; goto err; } @@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info, return 1; } -static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info 
*fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(fs_info, BTRFS_I(inode)); @@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_readpage(struct file *file, struct page *page) -{ - return extent_read_full_page(page, btree_get_extent, 0); -} - static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) @@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page) } static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, @@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0); + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { btrfs_put_root(root); return ERR_CAST(leaf); @@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; while (!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); - btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", - root->root_key.objectid, root->root_key.offset, + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(root->root_key.objectid, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); @@ 
-2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_INODE_IO, inode); + IO_TREE_BTREE_INODE_IO, inode); BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); - BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { + if (IS_ERR(tree_root->node)) { handle_error = true; + ret = PTR_ERR(tree_root->node); + tree_root->node = NULL; + btrfs_warn(fs_info, "couldn't read tree root"); + continue; - if (IS_ERR(tree_root->node)) { - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - } else if (!extent_buffer_uptodate(tree_root->node)) { - ret = -EUCLEAN; - } - - btrfs_warn(fs_info, "failed to read tree root"); + } else if (!extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + ret = -EIO; + btrfs_warn(fs_info, "error while reading tree root"); continue; } @@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); - btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + btrfs_init_async_reclaim_work(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if 
that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); @@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sb_buffer; } - sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -3418,6 +3357,8 @@ fail_block_groups: btrfs_put_block_group_cache(fs_info); fail_tree_roots: + if (fs_info->data_reloc_root) + btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root); free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); @@ -3481,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_CAST(page); super = page_address(page); - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -4055,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4551,6 +4497,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) cache->io_ctl.inode = NULL; iput(inode); } + ASSERT(cache->io_ctl.pages == NULL); btrfs_put_block_group(cache); } @@ -4685,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } - -static const struct extent_io_ops btree_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, 
-}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 00dc39d47ed3..fee69ced58b4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); - +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 219a09a2b734..9800a8306368 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -40,6 +40,7 @@ struct io_failure_record; enum { IO_TREE_FS_PINNED_EXTENTS, IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, @@ -48,6 +49,7 @@ enum { IO_TREE_INODE_FILE_EXTENT, IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, }; struct extent_io_tree { @@ -61,7 +63,6 @@ struct extent_io_tree { u8 owner; spinlock_t lock; - const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index de6fe176fdfb..3b21fee13e77 100644 --- 
a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -400,12 +400,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_BLOCK_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { @@ -414,12 +413,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_DATA_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else { @@ -429,8 +427,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } btrfs_print_leaf((struct extent_buffer *)eb); - btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", - eb->start, type); + btrfs_err(eb->fs_info, + "eb %llu iref 0x%lx invalid extent inline ref type %d", + eb->start, (unsigned long)iref, type); WARN_ON(1); return BTRFS_REF_TYPE_INVALID; @@ -1178,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. 
+ */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } update_inline_extent_backref(path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { @@ -1398,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. + * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for @@ -2306,7 +2323,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 objectid, u64 offset, u64 bytenr, + bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; @@ -2348,9 +2366,13 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; - /* If extent created before last snapshot => it's definitely shared */ - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) + /* + * If extent created before last snapshot => it's shared unless the + * snapshot has been deleted. Use the heuristic if strict is false. 
+ */ + if (!strict && + (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2375,7 +2397,7 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr) + u64 bytenr, bool strict) { struct btrfs_path *path; int ret; @@ -2386,7 +2408,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, do { ret = check_committed_ref(root, path, objectid, - offset, bytenr); + offset, bytenr, strict); if (ret && ret != -ENOENT) goto out; @@ -2845,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } @@ -2931,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. 
Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function get called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. 
+ */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2963,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -2972,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot. 
+ */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -2988,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTEMT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); @@ -3005,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3079,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; + 
btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3103,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); @@ -3118,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } @@ -3165,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * 
dump for debug build. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* @@ -3914,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, +static noinline int find_free_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; @@ -3950,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, num_bytes, empty_size, flags); space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { @@ -4199,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); @@ -4500,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -4522,8 +4679,8 @@ 
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ERR_PTR(-EUCLEAN); } - btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); - btrfs_tree_lock(buf); + btrfs_set_buffer_lockdep_class(owner, buf, level); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4569,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4585,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4602,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6def411b2eba..60f5f68d892d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); + if 
(is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); else - btrfsic_submit_bio(bio); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, { tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; @@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - bool data_inode = btrfs_ino(BTRFS_I(inode)) - != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (is_data_inode(inode)) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else @@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (data_inode) { + if (is_data_inode(inode)) { /* * The generic bio_readpage_error handles errors the @@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (!btrfs_submit_read_repair(inode, bio, offset, page, start - page_offset(page), start, end, mirror, - tree->ops->submit_bio_hook)) { + btrfs_submit_data_bio)) { uptodate = !bio->bi_status; offset += len; continue; @@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - ASSERT(tree->ops); if (btrfs_bio_fits_in_stripe(page, 
page_size, bio, bio_flags)) can_merge = false; @@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page) static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page, break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so 
it either points to the whole extent * or only part of it), we must make sure we do not submit a @@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, - end_bio_extent_readpage, mirror_num, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, - unsigned int read_flags) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); - return ret; -} - -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, - &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { @@ -4552,7 +4516,7 @@ next: * helper function for fiemap, which doesn't want to see any holes. 
* This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; @@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct ulist *roots; struct ulist *tmp_ulist; @@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup the last file extent. 
We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { goto out_free_ulist; } else { @@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent); @@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * then we're just getting a count and we can skip the * lookup stuff. 
*/ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_check_shared(root, btrfs_ino(inode), bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; @@ -4898,7 +4861,7 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); out_free_ulist: @@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); eb->blocking_writers = 0; - eb->lock_nested = false; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. 
*/ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { @@ -5622,6 +5584,36 @@ unlock_exit: return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. + */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) return; - } offset = offset_in_page(start); @@ -5655,9 +5643,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dstv, - unsigned long start, unsigned long len) +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5677,7 +5665,7 @@ int 
read_extent_buffer_to_user(const struct extent_buffer *eb, cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); - if (copy_to_user(dst, kaddr + offset, cur)) { + if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; } @@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; offset = offset_in_page(start); @@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, char *kaddr; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst, char *kaddr; unsigned long i = dst_offset >> PAGE_SHIFT; + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + WARN_ON(src->len != dst_len); offset = offset_in_page(dst_offset); @@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - 
btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; while (len > 0) { dst_off_in_page = offset_in_page(dst_offset); @@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 00a88f2eb5ab..f39d02e7f7ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -struct extent_io_ops { - /* - * The following callbacks must be always defined, the function - * pointer will be called unconditionally. 
- */ - submit_bio_hook_t *submit_bio_hook; - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); -}; - - #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { @@ -102,7 +90,7 @@ struct extent_buffer { int blocking_writers; atomic_t blocking_readers; - bool lock_nested; + bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; @@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num); +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags); +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void set_page_extent_mapped(struct page *page); @@ -241,9 +232,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dst, unsigned long start, - 
unsigned long len); +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dst, unsigned long start, + unsigned long len); void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *src); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7d5ec71615b8..8f4f2bd6d9b9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - csum, nblocks); + count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, + disk_bytenr, csum, nblocks); if (count) goto found; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb824c7cb7c7..0ff659455b1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,11 +1057,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); + setup_items_for_insert(root, path, &key, &extent_item_size, 1); *key_inserted = 1; } @@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); - last_pos = start_pos - + round_up(pos + write_bytes - start_pos, - fs_info->sectorsize) - 1; + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; @@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(&inode->vfs_inode, - ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); return 
-EAGAIN; } @@ -1571,7 +1564,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL); + NULL, NULL, NULL, false); if (ret <= 0) { ret = 0; if (!nowait) @@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from); + written = btrfs_direct_IO(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. 
+ */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) @@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by setattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; @@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; + u64 len; + bool full_sync; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* - * Set the range to full if the NO_HOLES feature is not enabled. - * This is to avoid missing file extent items representing holes after - * replaying the log. + * Always set the range to a full range, otherwise we can get into + * several problems, from missing file extent items to represent holes + * when not using the NO_HOLES feature, to log tree corruption due to + * races between hole detection during logging and completion of ordered + * extents outside the range, to missing checksums due to ordered extents + * for which we flushed only a subset of their pages. 
*/ - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { - start = 0; - end = LLONG_MAX; - } + start = 0; + end = LLONG_MAX; + len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete @@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges and races between logging and - * completion of ordered extents for adjancent ranges - both races - * could lead to file extent items in the log with overlapping ranges. - * Do this while holding the inode lock, to avoid races with other - * tasks. + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); /* * Before we acquired the inode's lock, someone may have dirtied more @@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * - * Also, the range length can be represented by u64, we have to do the - * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. 
+ * For a full fsync we wait for the ordered extents to complete while + * for a fast fsync we wait just for writeback to complete, and then + * attach the ordered extents to the transaction so that a transaction + * commit waits for their completion, to avoid data loss if we fsync, + * the current transaction commits before the ordered extents complete + * and a power failure happens right after that. */ - ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); - if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Get our ordered extents as soon as possible to avoid doing + * checksum lookups in the csum tree, and use instead the + * checksums attached to the ordered extents. + */ + btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), + &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->i_mapping, start, end); } + + if (ret) + goto out_release_extents; + atomic_inc(&root->log_batch); + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { + (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && + (full_sync || list_empty(&ctx.ordered_extents)))) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. 
*/ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } /* @@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } - ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2286,6 +2335,12 @@ out: if (!ret) ret = err; return ret > 0 ? 
-EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + lockend); /* * We need to make sure we have no ordered extents in this range @@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } -static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, - struct btrfs_clone_extent_info *clone_info, - const u64 clone_len) + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; int slot; struct btrfs_ref ref = { 0 }; - u64 ref_offset; int ret; - if (clone_len == 0) + if (replace_len == 0) return 0; - if (clone_info->disk_offset == 0 && + if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = clone_info->file_offset; + key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, clone_info->extent_buf, + write_extent_buffer(leaf, extent_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), - clone_info->item_size); + 
sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); - btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - clone_info->file_offset, clone_len); + extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ - if (clone_info->disk_offset == 0) + if (extent_info->disk_offset == 0) return 0; - inode_add_bytes(inode, clone_len); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); - ret = btrfs_inc_extent_ref(trans, &ref); + inode_add_bytes(inode, replace_len); + + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = extent_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + extent_info->file_offset, + extent_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + 
extent_info->insertions++; return ret; } @@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). - * @clone_info is NULL for fallocate's hole punching and non-NULL for extent - * cloning. - * When cloning, we don't want to end up in a state where we dropped extents - * without inserting a new one, so we must abort the transaction to avoid a - * corruption. + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. */ -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set or if we are cloning - * an extent + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; @@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. 
*/ - if (clone_info && ret && ret != -EOPNOTSUPP) + if (extent_info && !extent_info->is_new_extent && + ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; } trans->block_rsv = &fs_info->trans_block_rsv; - if (!clone_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); @@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info && drop_end > clone_info->file_offset) { - u64 clone_len = drop_end - clone_info->file_offset; + if (extent_info && drop_end > extent_info->file_offset) { + u64 replace_len = drop_end - extent_info->file_offset; - ret = btrfs_insert_clone_extent(trans, inode, path, - clone_info, clone_len); + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } - clone_info->data_len -= clone_len; - clone_info->data_offset += clone_len; - clone_info->file_offset += clone_len; + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; } cur_offset = drop_end; @@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!clone_info) { + if (!extent_info) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; @@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full 
fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info) + if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ - if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); if (ret) { @@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), cur_offset, drop_end - cur_offset); @@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, - clone_info->data_len); + if (extent_info) { + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, + extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, - btrfs_inode_sectorsize(inode)) - 1; + btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode 
*inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) @@ -3044,7 +3119,7 @@ enum { RANGE_BOUNDARY_HOLE, }; -static int btrfs_zero_range_check_range_boundary(struct inode *inode, +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { const u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode, * to cover them. 
*/ if (!IS_ALIGNED(offset, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, offset); + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { @@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode, } if (!IS_ALIGNED(offset + len, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; @@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(inode); + int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; alloc_start = round_down(offset, blocksize); @@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + locked_end); if (ordered && ordered->file_offset + ordered->num_bytes > alloc_start && @@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct inode *inode = file_inode(iocb->ki_filp); + + inode_lock_shared(inode); + ret = btrfs_direct_IO(iocb, to); + inode_unlock_shared(inode); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, diff --git 
a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef0fd7afb0b1..af0013d3df63 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit @@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); @@ -1186,7 +1183,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: - io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; @@ -1347,13 +1343,14 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * them out later */ io_ctl_drop_pages(io_ctl); + io_ctl_free(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 8b1f5c8897b7..6b9faf3b0e96 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache) size_t bitmap_size; u64 num_bitmaps, total_bitmap_size; + if (WARN_ON(cache->length == 0)) + btrfs_warn(cache->fs_info, "block group %llu length is zero", + cache->start); + /* * We convert to bitmaps when the disk space required for using extents * exceeds that required for using bitmaps. 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 51fcd82d41c0..936c3137c646 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,7 +6,6 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -31,6 +30,7 @@ #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> +#include <linux/iomap.h> #include <asm/unaligned.h> #include "misc.h" #include "ctree.h" @@ -59,9 +59,10 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif - static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -1610,7 +1603,7 @@ next_slot: goto out_check; ret = btrfs_cross_ref_exist(root, ino, found_key.offset - - extent_offset, disk_bytenr); + extent_offset, disk_bytenr, false); if (ret) { /* * ret could be -EIO if the above fails to read @@ -2161,11 +2154,8 @@ static blk_status_t 
btrfs_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { struct inode *inode = private_data; - blk_status_t ret = 0; - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); - BUG_ON(ret); /* -ENOMEM */ - return 0; + return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } /* @@ -2186,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2248,16 +2237,15 @@ out: * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2360,7 +2348,7 @@ again: unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -2551,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; @@ -2571,8 
+2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, - &stack_fi, oe->qgroup_rsv); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); } /* @@ -2669,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_ordered_extent_file_extent(trans, inode, - ordered_extent); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, @@ -2686,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2755,7 +2742,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. 
*/ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2775,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; @@ -2787,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; @@ -2836,9 +2823,8 @@ zeroit: * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. 
*/ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -3058,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* @@ -3398,7 +3383,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -4054,7 +4038,7 @@ out_end_trans: err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { @@ -4586,7 +4570,7 @@ again: &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -4851,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. 
*/ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -5308,15 +5289,15 @@ static void inode_tree_add(struct inode *inode) spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -6314,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -6377,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6392,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -6543,8 +6517,7 @@ struct extent_map 
*btrfs_get_extent(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6572,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; @@ -6582,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -6595,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, */ path->leave_spinning = 1; + path->recurse = btrfs_is_free_space_inode(inode); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { - err = ret; goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; @@ -6628,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); @@ -6646,12 +6621,11 @@ next: path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } else if (ret > 0) { + else if (ret > 0) goto not_found; - } + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6702,10 +6676,8 @@ next: BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6729,29 +6701,28 @@ not_found: em->len = len; 
em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } - BUG_ON(!em); /* Error is always set */ return em; } @@ -6953,6 +6924,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent + * @strict: if true, omit optimizations that might force us into unnecessary + * cow. e.g., don't trust generation number. * * This function will flush ordered extents in the range to ensure proper * nocow checks for (nowait == false) case. @@ -6967,7 +6940,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) + u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path; @@ -7045,8 +7018,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, * Do the same check as in btrfs_cross_ref_exist but without the * unnecessary search. 
*/ - if (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) + if (!strict && + (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; backref_offset = btrfs_file_extent_offset(leaf, fi); @@ -7082,7 +7056,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, */ ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr); + key.offset - backref_offset, disk_bytenr, + strict); if (ret) { ret = 0; goto out; @@ -7110,7 +7085,7 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7159,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7248,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) 
@@ -7303,7 +7255,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1 && + &orig_block_len, &ram_bytes, false) == 1 && btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; @@ -7332,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { @@ -7343,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. 
*/ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); + + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; - if (!create) + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. 
- */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; } + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->sync = sync; + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } + } + iomap->private = dio_data; + + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. 
*/ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7432,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; + } + + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. 
+ */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; free_extent_map(em); @@ -7470,8 +7457,63 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. 
+ */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } + kfree(dio_data); + iomap->private = NULL; + return ret; } @@ -7495,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - dio_end_io(dip->dio_bio); + bio_endio(dip->dio_bio); kfree(dip); } @@ -7619,10 +7661,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { struct inode *inode = private_data; - blk_status_t ret; - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); - BUG_ON(ret); /* -ENOMEM */ - return 0; + + return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) @@ -7731,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); - - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - /* - * Setting range start and end to the same value means that - * no cleanup will happen in btrfs_direct_IO - */ - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } return dip; } -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7765,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7773,8 +7801,8 
@@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - dio_end_io(dio_bio); - return; + bio_endio(dio_bio); + return BLK_QC_T_NONE; } if (!write && csum) { @@ -7845,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return; + return BLK_QC_T_NONE; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7889,37 +7919,59 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. 
+ */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -7927,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. 
*/ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + /* + * We have are actually a sync iocb, so we need our fancy endio to know + * if we need to sync. + */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, offset, dio_data.reserve, - true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. 
- */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(BTRFS_I(inode), - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -8003,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, btrfs_get_extent, 0); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + unsigned long bio_flags = 0; + struct bio *bio = NULL; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8092,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping, static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = 
inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8111,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; @@ -8121,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); @@ -8143,7 +8168,7 @@ again: struct btrfs_ordered_inode_tree *tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8182,7 +8207,7 @@ again: * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. 
*/ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | @@ -8284,7 +8309,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -8615,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8644,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); 
btrfs_remove_ordered_extent(inode, ordered); @@ -8652,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); - btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); - btrfs_put_root(BTRFS_I(inode)->root); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) @@ -8781,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -8943,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - 
commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -8999,46 +9000,13 @@ out_fail: dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? 
ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -9106,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9260,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9307,23 +9264,8 @@ out_fail: btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? 
ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9389,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9429,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9456,18 +9400,15 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9490,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); @@ -9569,7 +9505,6 @@ static int btrfs_symlink(struct 
inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9634,11 +9569,15 @@ out_unlock: return err; } -static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, struct inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; + struct btrfs_replace_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int ret; @@ -9655,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) - return ret; - return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, - &stack_fi, ret); + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_replace_file_extents(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; } + static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ 
-9681,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -9700,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -9717,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -9786,8 +9742,10 @@ next: break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, @@ -9866,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10073,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. 
The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10136,7 +10093,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, free_extent_map(em); em = NULL; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); if (ret < 0) { goto out; } else if (ret) { @@ -10226,7 +10183,7 @@ out: if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; @@ -10284,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, -}; - /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. 
They assume @@ -10307,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bd3511c5ca81..ab408a23ba32 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. 
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -742,7 +755,7 @@ fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) @@ -856,7 +869,7 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); @@ -1306,7 +1319,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return 
ret; } @@ -2086,9 +2099,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, sh.len = item_len; sh.transid = found_transid; - /* copy search result header */ - if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = -EFAULT; + /* + * Copy search result header. If we fault then loop again so we + * can fault in the pages and -EFAULT there if there's a + * problem. Otherwise we'll fault and then copy the buffer in + * properly this next time through + */ + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = 0; goto out; } @@ -2096,10 +2114,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, if (item_len) { char __user *up = ubuf + *sk_offset; - /* copy the item */ - if (read_extent_buffer_to_user(leaf, up, - item_off, item_len)) { - ret = -EFAULT; + /* + * Copy the item, same behavior as above, but reset the + * *sk_offset so we copy the full thing again. + */ + if (read_extent_buffer_to_user_nofault(leaf, up, + item_off, item_len)) { + ret = 0; + *sk_offset -= sizeof(sh); goto out; } @@ -2184,6 +2206,11 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { + ret = fault_in_pages_writeable(ubuf + sk_offset, + *buf_size - sk_offset); + if (ret) + break; + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) @@ -3112,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3129,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3158,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file 
*file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3169,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3200,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3218,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3448,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *tmp; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3504,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, break; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3722,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if 
(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3937,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3983,7 +4004,6 @@ again: } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4038,10 +4058,10 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. 
*/ need_unlock = false; @@ -4060,7 +4080,7 @@ out_bargs: out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; @@ -4883,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f75612e18a82..66e02ebdd340 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ * performance reasons. * * - * Lock nesting - * ------------ + * Lock recursion + * -------------- * * A write operation on a tree might indirectly start a look up on the same * tree. This can happen when btrfs_cow_block locks the tree and needs to @@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); @@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); @@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. 
*/ -void btrfs_tree_read_lock(struct extent_buffer *eb) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse) { u64 start_ns = 0; @@ -263,8 +264,9 @@ again: * depends on this as it may be called on a partly * (write-)locked tree. */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; + WARN_ON(!recurse); + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; read_unlock(&eb->lock); trace_btrfs_tree_read_lock(eb, start_ns); return; @@ -279,6 +281,11 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); } +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); +} + /* * Lock extent buffer for read, optimistically expecting that there are no * contending blocking writers. If there are, don't wait. @@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. 
*/ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * * The rwlock is held for write upon exit. */ -void btrfs_tree_lock(struct extent_buffer *eb) +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -434,6 +441,11 @@ again: trace_btrfs_tree_lock(eb, start_ns); } +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + /* * Release the write lock, either blocking or spinning (ie. there's no need * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). @@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d715846c10b8..3ea81ed3320b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -16,11 +16,81 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. 
+ */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. + */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + + /* + * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. 
As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future. + */ + BTRFS_NESTING_MAX, +}; + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + struct btrfs_path; +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); @@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse); + +static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + return __btrfs_read_lock_root_node(root, false); +} #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ebac13389e7e..87bac9ecdf4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); + 
trace_btrfs_ordered_extent_add(inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -377,17 +378,16 @@ out: * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used * to make sure this function only returns 1 once for a given ordered extent. */ -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; int ret; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { entry = *cached; @@ -408,7 +408,7 @@ have_entry: } if (io_size > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, + btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } @@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); @@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. No references are dropped * and waiters are woken up. 
*/ -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; + bool pending; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. 
+ */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(inode, entry); + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); @@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered->inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); complete(&ordered->completion); } @@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, * in the extent, and it waits on the io completion code to insert * metadata into the btree corresponding to the extent */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); trace_btrfs_ordered_extent_start(inode, entry); @@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) end = 
orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -775,17 +803,45 @@ out: } /* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + +/* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) @@ -803,20 +859,21 @@ out: * try to find a checksum. 
This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len) +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); + ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) return 0; @@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - inode->i_sb->s_blocksize_bits; - num_sectors = ordered_sum->len >> - inode->i_sb->s_blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; + num_sectors = ordered_sum->len >> blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); @@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent_cached(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } diff 
--git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d61ea9c880a3..c3a2325e64a4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,6 +56,12 @@ enum { BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ BTRFS_ORDERED_REGULAR, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, }; struct btrfs_ordered_extent { @@ -104,6 +110,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* used for fast fsyncs */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) } void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, @@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 
file_offset); +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 61f44e78e3c9..7695c4783d33 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,44 @@ #include "disk-io.h" #include "print-tree.h" +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(u64 objectid, char *buf) +{ + int i; + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", objectid); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid); + return buf; +} + static void print_chunk(struct extent_buffer *eb, struct 
btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); @@ -95,9 +133,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) * offset is supposed to be a tree block which * must be aligned to nodesize. */ - if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) - pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", - offset, (unsigned long long)eb->fs_info->nodesize); + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -112,8 +151,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) * must be aligned to nodesize. */ if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) - pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", - offset, (unsigned long long)eb->fs_info->nodesize); + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index e6bb38fd75ad..78b99385a503 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,7 +6,11 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + void btrfs_print_leaf(struct extent_buffer *l); void btrfs_print_tree(struct extent_buffer *c, bool follow); +const char *btrfs_root_name(u64 objectid, char *buf); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0f350c3a0cf..580899bdb991 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. 
* - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 243a2e44526e..9d4f5316a7e8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work) kfree(rmw); } -static void __reada_start_machine(struct btrfs_fs_info *fs_info) +/* Try to start up to 10k READA requests for a group of devices */ +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 enqueued; u64 total = 0; - int i; + struct btrfs_device *device; -again: do { enqueued = 0; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(device); } - mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + return total; +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + int i; + u64 enqueued = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + enqueued += reada_start_for_fsdevs(fs_devices); + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + enqueued += reada_start_for_fsdevs(seed_devs); + + mutex_unlock(&fs_devices->device_list_mutex); if (enqueued == 0) return; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5cd02514cf4d..99aa87c08912 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -45,7 +45,7 @@ out: return ret; } -static int copy_inline_to_page(struct inode *inode, +static int 
copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, @@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode, char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); @@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - file_offset, block_size); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); if (ret) goto out; - page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(inode->i_mapping)); + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); if (!page) { ret = -ENOMEM; goto out_unlock; } set_page_extent_mapped(page); - clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, - 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -134,9 +134,9 @@ out_unlock: put_page(page); } if (ret) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - file_offset, block_size, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); @@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst, struct 
btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(dst, new_key->offset, + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; @@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -231,8 +231,8 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. 
*/ - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -439,7 +439,7 @@ process_slot: if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; + struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b @@ -462,8 +462,8 @@ process_slot: clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, drop_start, + clone_info.is_new_extent = false; + ret = btrfs_replace_file_extents(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) @@ -520,6 +520,8 @@ process_slot: ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; @@ -533,7 +535,7 @@ process_slot: btrfs_release_path(path); path->leave_spinning = 0; - ret = btrfs_punch_hole_range(inode, path, last_dest_end, + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4ba1ab9cc76d..3602806d71bd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1206,7 +1206,8 @@ again: } if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1274,7 +1275,8 @@ again: btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); + slot, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * relocated and the block is tree root. 
*/ leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); if (ret < 0) @@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); + slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c89697486366..702dc5441f03 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } return ret; } -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5a6cb9db512e..cf63f1e27a27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int success; bool full_stripe_locked; unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); @@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); 
sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("i/o error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.verify_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) @@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, return 0; } +static void scrub_workers_put(struct btrfs_fs_info *fs_info) +{ + if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, + &fs_info->scrub_lock)) { + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; + + scrub_workers = fs_info->scrub_workers; + scrub_wr_comp = fs_info->scrub_wr_completion_workers; + scrub_parity = fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; + fs_info->scrub_parity_workers = NULL; + mutex_unlock(&fs_info->scrub_lock); + + btrfs_destroy_workqueue(scrub_workers); + btrfs_destroy_workqueue(scrub_wr_comp); + btrfs_destroy_workqueue(scrub_parity); + } +} + /* * get a reference count on fs_info->scrub_workers. 
start worker if necessary */ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; + int ret = -ENOMEM; - lockdep_assert_held(&fs_info->scrub_lock); + if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) + return 0; - if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL); - fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", - flags, is_dev_replace ? 1 : max_active, 4); - if (!fs_info->scrub_workers) - goto fail_scrub_workers; - - ASSERT(fs_info->scrub_wr_completion_workers == NULL); - fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); - if (!fs_info->scrub_wr_completion_workers) - goto fail_scrub_wr_completion_workers; + scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, + is_dev_replace ? 
1 : max_active, 4); + if (!scrub_workers) + goto fail_scrub_workers; - ASSERT(fs_info->scrub_parity_workers == NULL); - fs_info->scrub_parity_workers = - btrfs_alloc_workqueue(fs_info, "scrubparity", flags, + scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_parity_workers) - goto fail_scrub_parity_workers; + if (!scrub_wr_comp) + goto fail_scrub_wr_completion_workers; + scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, + max_active, 2); + if (!scrub_parity) + goto fail_scrub_parity_workers; + + mutex_lock(&fs_info->scrub_lock); + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL && + fs_info->scrub_wr_completion_workers == NULL && + fs_info->scrub_parity_workers == NULL); + fs_info->scrub_workers = scrub_workers; + fs_info->scrub_wr_completion_workers = scrub_wr_comp; + fs_info->scrub_parity_workers = scrub_parity; refcount_set(&fs_info->scrub_workers_refcnt, 1); - } else { - refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + return 0; } - return 0; + /* Other thread raced in and created the workers for us */ + refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + ret = 0; + btrfs_destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(scrub_workers); fail_scrub_workers: - return -ENOMEM; + return ret; } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, @@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = 
NULL; if (btrfs_fs_closing(fs_info)) return -EAGAIN; @@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) + goto out_free_ctx; + mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -ENODEV; - goto out_free_ctx; + goto out; } if (!is_dev_replace && !readonly && @@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); ret = -EROFS; - goto out_free_ctx; + goto out; } mutex_lock(&fs_info->scrub_lock); @@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; - goto out_free_ctx; + goto out; } down_read(&fs_info->dev_replace.rwsem); @@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EINPROGRESS; - goto out_free_ctx; + goto out; } up_read(&fs_info->dev_replace.rwsem); - ret = scrub_workers_get(fs_info, is_dev_replace); - if (ret) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - goto out_free_ctx; - } - sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->scrub_lock); dev->scrub_ctx = NULL; - if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) { - scrub_workers = fs_info->scrub_workers; 
- scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; - - fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; - fs_info->scrub_parity_workers = NULL; - } mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + scrub_workers_put(fs_info); scrub_put_ctx(sctx); return ret; - +out: + scrub_workers_put(fs_info); out_free_ctx: scrub_free_ctx(sctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9813a5b075a..340c76a12ce1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct send_ctx { struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -278,11 +276,6 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, @@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx) u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - 
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; @@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) } /* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. 
Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. + */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + +/* * This does all the move/link/unlink/rmdir magic. */ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) @@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . 
(ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * than the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. 
If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. + * If we find an overwritten ref that is not the first ref, + * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct name_cache_entry *nce; struct waiting_dir_move *wdm; + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) @@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } } + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. 
+ */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) @@ -4799,7 +4911,25 @@ out: return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - ssize_t ret = 0; + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; - last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ @@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); 
unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + sctx->send_size += cur_len; } -out: iput(inode); return ret; } @@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) @@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; - goto out; - } - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) goto out; @@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5033,8 +5150,8 @@ out: static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, 
len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } @@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. 
Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } - - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } @@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. 
*/ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) + struct btrfs_root *right_root, void *ctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx) goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); @@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) @@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); @@ -7378,7 +7466,6 @@ out: kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index ead397f7034f..de91488b7cd0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -13,7 +13,6 @@ #define 
BTRFS_SEND_STREAM_VERSION 1 #define BTRFS_SEND_BUF_SIZE SZ_64K -#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..64099565ab8f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry(found, head, list) found->full = 0; - rcu_read_unlock(); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) @@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (ret) return ret; - list_add_rcu(&space_info->list, &info->space_info); + list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); + list_for_each_entry(found, head, list) { + if (found->flags & flags) return found; - } } - rcu_read_unlock(); return NULL; } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) @@ -476,28 +465,6 @@ again: up_read(&info->groups_sem); } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from 
r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { @@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered) { - struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; - u64 async_pages; u64 items; long time_left; - unsigned long nr_pages; int loops; /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly, so increase the amount to reclaim by 2x in order to + * make sure we're flushing enough delalloc to hopefully reclaim + * some metadata reservations. 
+ */ + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + btrfs_start_delalloc_roots(fs_info, items); - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. 
- */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -596,14 +551,6 @@ skip_async: } spin_unlock(&space_info->lock); - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); @@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; - u64 bytes_needed; u64 reclaim_bytes = 0; + u64 bytes_needed = 0; u64 cur_free_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; @@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; + if (ticket) + bytes_needed = ticket->bytes; if (bytes_needed > cur_free_bytes) bytes_needed -= cur_free_bytes; @@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, goto commit; /* - * See if there is some space in the delayed insertion reservation for - * this reservation. + * See if there is some space in the delayed insertion reserve for this + * reservation. 
If the space_info's don't match (like for DATA or + * SYSTEM) then just go enospc, reclaiming this space won't recover any + * space to satisfy those reservations. */ if (space_info != delayed_rsv->space_info) goto enospc; @@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + shrink_delalloc(fs_info, space_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: @@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), + btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); @@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. 
But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * FLUSH_DELAYED_REFS + * The above two cases generate delayed refs that will affect + * ->total_bytes_pinned. However this counter can be inconsistent with + * reality if there are outstanding delayed refs. This is because we adjust + * the counter based solely on the current set of delayed refs and disregard + * any on-disk state which might include more refs. So for example, if we + * have an extent with 2 references, but we only drop 1, we'll see that there + * is a negative delayed ref count for the extent and assume that the space + * will be freed, and thus increase ->total_bytes_pinned. + * + * Running the delayed refs gives us the actual real view of what will be + * freed at the transaction commit time. This stage will not actually free + * space for us, it just makes sure that may_commit_transaction() has all of + * the information it needs to make the right decision. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by the previous + * two stages. We will not commit the transaction if we don't think we're + * likely to satisfy our request, which means if our current free space + + * total_bytes_pinned < reservation we will not commit. This is why the + * previous states are actually important, to make sure we know for sure + * whether committing the transaction will allow us to make progress. + * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. 
+ */ +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, + COMMIT_TRANS, + ALLOC_CHUNK_FORCE, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + int flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state]); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + } + spin_unlock(&space_info->lock); + } +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, 
btrfs_async_reclaim_data_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } while (flush_state < states_nr); } +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } +} + static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) @@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, int ret; switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); @@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket); + break; default: ASSERT(0); break; @@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. 
*/ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { + struct work_struct *async_work; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); @@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); if (flush == BTRFS_RESERVE_FLUSH_ALL || - flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; @@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush); + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ 
-1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } return ret; } + +/** + * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation + * @fs_info - the filesystem + * @bytes - the number of bytes we need + * @flush - how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. + */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + int ret; + + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c3c64019950a..5646393b928c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 079b059818e9..c46be27be700 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -7,16 +7,6 @@ #include "ctree.h" -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, 
unsigned off, int size) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e529ddb35b87..8840a4fa81eb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -625,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } else if (strncmp(args[0].from, "lzo", 3) == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; + info->compress_level = 0; btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -1870,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * the filesystem is busy. */ cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); @@ -2162,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - rcu_read_lock(); - list_for_each_entry_rcu(found, &fs_info->space_info, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2192,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c8df2edafd85..279d9262b676 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -14,6 +14,7 @@ #include "ctree.h" #include "discard.h" #include "disk-io.h" +#include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" @@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, 
PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), NULL }; @@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, BTRFS_ATTR(, checksum, btrfs_checksum_show); +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return scnprintf(buf, PAGE_SIZE, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), NULL, }; @@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); + + 
list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; - btrfs_reset_fs_info_ptr(fs_info); - sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { @@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_INCOMPAT] = "incompat", }; -const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } @@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. 
+ */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } - - space_info->block_group_kobjs[index] = &rkobj->kobj; } /* @@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -/* when one_device is NULL, it removes all device links */ - -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; + struct kobject *devices_kobj; - if (!fs_devices->devices_kobj) - return -EINVAL; - - if (one_device) { - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. 
+ */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); - return 0; + if (device->bdev) { + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(devices_kobj, disk_kobj->name); } - list_for_each_entry(one_device, &fs_devices->devices, dev_list) { - - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); } - - return 0; } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, @@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { - int error = 0; - struct btrfs_device *dev; + int ret; unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; - nofs_flag = memalloc_nofs_save(); - list_for_each_entry(dev, &fs_devices->devices, dev_list) { + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); - if (one_device && one_device != dev) - continue; + nofs_flag = memalloc_nofs_save(); - if (dev->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; + if (device->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + disk = 
device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_devices->devices_kobj, - disk_kobj, disk_kobj->name); - if (error) - break; + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; } + } - init_completion(&dev->kobj_unregister); - error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devinfo_kobj, "%llu", - dev->devid); - if (error) { - kobject_put(&dev->devid_kobj); - break; - } + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); } + +out: memalloc_nofs_restore(nofs_flag); + return ret; +} - return error; +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + } + + return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) @@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid) +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) + { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -1333,7 +1430,7 @@ void 
btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); @@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - btrfs_set_fs_info_ptr(fs_info); - - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } @@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 features; - int ret; + u64 __maybe_unused features; + int __maybe_unused ret; if (!fs_info) return; + /* + * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not + * safe when called from some contexts (eg. 
balance) + */ features = get_features(fs_info, set); ASSERT(bit & supported_feature_masks[set]); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index cf839c46a131..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -13,15 +13,12 @@ enum btrfs_feature_set { }; char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); +int btrfs_sysfs_add_device(struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a1b9f9b5978e..df54cdfdc250 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, &key, &value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 
894a63a92236..e6719f7db386 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); } /* @@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } BTRFS_I(inode)->root = root; - btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 20c6ac1a5de7..52ada47aff50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,6 +292,8 @@ loop: } cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); @@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); + 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_otransid(new_root_item, trans->transid); old = 
btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); @@ -1636,6 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); + pending->snap = NULL; btrfs_abort_transaction(trans, ret); goto fail; } @@ -2164,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(trans); + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. + */ + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); + btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d60b055b8695..858d9153a1cd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -85,6 +85,13 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; + + /* + * Number of ordered extents the transaction must wait for before + * committing. These are ordered extents started by a fast fsync. 
+ */ + atomic_t pending_ordered; + wait_queue_head_t pending_wait; }; #define __TRANS_FREEZABLE (1U << 0) @@ -105,6 +112,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) +#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 517b44300a05..f0ffd5ee77bd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf, /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { inode_item_err(leaf, slot, - "invalid inode generation: has %llu expect [0, %llu]", + "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; } @@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_root_item ri; + struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; @@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (ret < 0) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { generic_err(leaf, slot, - "invalid root item size, have %u expect %zu", - btrfs_item_size_nr(leaf, slot), sizeof(ri)); + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); } + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow generation_v2 as 0, it will still pass the check. 
+ */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - sizeof(ri)); + btrfs_item_size_nr(leaf, slot)); /* Generation related */ if (btrfs_root_generation(&ri) > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 696dd861cc3c..56cbc1706b6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -96,8 +96,6 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root) */ void btrfs_pin_log_trans(struct btrfs_root *root) { - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); } /* @@ -3449,11 +3445,13 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (ret == -ENOSPC) { + if (err == -ENOSPC) { btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0) - btrfs_abort_transaction(trans, ret); + err = 0; + } else if (err < 0 && err != -ENOENT) { + /* ENOENT can be returned if the entry hasn't been fsynced yet */ + btrfs_abort_transaction(trans, err); + } btrfs_end_log_trans(root); @@ -3613,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * search and this search we'll not find the key again and can just * bail. 
*/ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3632,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { @@ -4080,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4092,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. 
+ */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. */ @@ -4138,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4340,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; @@ -4357,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. 
It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4432,8 +4483,32 @@ process: btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extent completes after the transaction + * commits and a power failure happens after the transaction commit. 
+ */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4839,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE_ALL, - 0, LLONG_MAX, ctx); + ctx); btrfs_add_delayed_iput(inode); } } @@ -4881,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). */ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -4897,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. 
*/ ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5110,8 +5185,6 @@ next_key: static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct btrfs_path *path; @@ -5290,7 +5363,7 @@ log_extents: } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5299,31 +5372,8 @@ log_extents: struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. 
- */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5337,19 +5387,34 @@ log_extents: } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). + * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. */ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). 
+ * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -5589,7 +5654,7 @@ process_leaf: if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5733,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5784,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > last_committed) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -5840,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); if (ret) break; } @@ -5948,8 +6012,6 @@ out: static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { @@ -6002,7 +6064,7 @@ static int 
btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6098,15 +6160,13 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -6369,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). - * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). 
*/ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6403,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; - - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; + return; - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fall back to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. 
+ */ + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..731bd9c029f5 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,8 +16,11 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; + /* Only used for fast fsyncs. */ + struct list_head ordered_extents; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) @@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int 
btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ee96c5869f57..58b9c419a2b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> @@ -290,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -317,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. 
*/ @@ -355,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -405,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void) * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -432,7 +434,8 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } @@ -592,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path, btrfs_free_device(device); ret = 0; - if (fs_devices->num_devices == 0) - break; } mutex_unlock(&fs_devices->device_list_mutex); @@ -940,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, - "duplicate device fsid:devid for %pU:%llu old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); return ERR_PTR(-EEXIST); } bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, - "device fsid %pU devid %llu moved old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, rcu_str_deref(device->name), + path, current->comm, + 
task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); @@ -1034,28 +1037,21 @@ error: return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1093,10 +1089,22 @@ again: btrfs_free_device(device); } - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. 
+ */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; + + mutex_lock(&uuid_mutex); + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1148,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; + lockdep_assert_held(&uuid_mutex); + if (--fs_devices->opened > 0) - return 0; + return; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; - - return 0; + fs_devices->fs_info = NULL; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; - int ret; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } - mutex_unlock(&uuid_mutex); + close_fs_devices(fs_devices); + if (!fs_devices->opened) + list_splice_init(&fs_devices->seed_list, &list); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + 
list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } - return ret; + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, @@ -1196,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } if (fs_devices->open_devices == 0) return -EINVAL; @@ -1960,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device( * this_dev) which is active. 
*/ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && @@ -1998,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } -static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) { struct btrfs_super_block *disk_super; int copy_num; @@ -2039,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2143,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2164,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } @@ -2220,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void 
btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { - struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(fs_info, srcdev->bdev, - srcdev->name->str); - } + mutex_lock(&uuid_mutex); btrfs_close_bdev(srcdev); synchronize_rcu(); @@ -2235,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace @@ -2245,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) @@ -2265,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2374,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original 
seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. + */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -2398,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; - fs_devices->seed = seed_devices; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -2510,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2524,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { @@ -2612,9 
+2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); - /* * we've got more storage, clear any full flags on the space * infos @@ -2622,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_device(device); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { @@ -2647,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_sysfs; } - btrfs_sysfs_update_sprout_fsid(fs_devices, - fs_info->fs_devices->fsid); + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout + */ + btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); @@ -2656,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2691,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -2717,7 +2711,7 @@ error_free_device: btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -4044,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device 
add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4180,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4191,7 +4185,7 @@ out: reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4293,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4375,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } @@ -4462,6 +4456,7 @@ int btrfs_uuid_scan_kthread(void *data) goto skip; } update_tree: + btrfs_release_path(path); if (!btrfs_is_empty_uuid(root_item.uuid)) { ret = btrfs_uuid_tree_add(trans, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, @@ -4486,6 +4481,7 @@ update_tree: } skip: + btrfs_release_path(path); if (trans) { ret = btrfs_end_transaction(trans); trans = NULL; @@ -4493,7 +4489,6 @@ skip: break; } - btrfs_release_path(path); if (key.offset < (u64)-1) { key.offset++; } else if (key.type < BTRFS_ROOT_ITEM_KEY) { @@ -6459,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; + + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + 
list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } - while (fs_devices) { + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6471,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6483,8 +6485,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; + unsigned int nofs_flag; + /* + * We call this under the chunk_mutex, so we want to use NOFS for this + * allocation, however we don't want to change btrfs_alloc_device() to + * always do NOFS because we use it in a lot of other GFP_KERNEL safe + * places. 
+ */ + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, dev_uuid); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -6521,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev; @@ -6717,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + /* This will match only for multi-device seed fs */ + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { @@ -6739,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; @@ -6746,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; -out: + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); + return fs_devices; } @@ -7178,17 +7188,22 @@ error: void btrfs_init_devices_late(struct btrfs_fs_info 
*fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) + fs_devices->fs_info = fs_info; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); - fs_devices = fs_devices->seed; + seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -7214,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } -int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) { - struct btrfs_key key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return ret < 0 ? 
ret : 0; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + + return 0; +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -7232,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_set(device, i, 0); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - btrfs_dev_stat_set(device, i, 0); + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + 
goto out; } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); } +out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return ret < 0 ? ret : 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, @@ -7485,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) mutex_unlock(&trans->fs_info->chunk_mutex); } -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } -} - /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ @@ -7583,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, - NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5eea93916fbf..bf27ac07d315 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string *name; + struct rcu_string __rcu *name; u64 generation; @@ -246,7 +246,7 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; - struct btrfs_fs_devices *seed; + struct list_head seed_list; bool seeding; int opened; @@ -435,7 +435,7 @@ int 
btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); @@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path); int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); diff --git a/fs/buffer.c b/fs/buffer.c index 061dd202979d..5a28a6aa7f16 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1958,7 +1958,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, */ set_buffer_new(bh); set_buffer_unwritten(bh); - /* FALLTHRU */ + fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || offset >= i_size_read(inode)) @@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ -#if 0 - /* Not really sure about this - do we need this ? 
*/ - if (page->mapping->a_ops->invalidatepage) - page->mapping->a_ops->invalidatepage(page, offset); -#endif unlock_page(page); return 0; /* don't care */ } @@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ - do_invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; /* don't care */ } @@ -3157,6 +3141,15 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { + /* + * The bh should be mapped, but it might not be if the + * device was hot-removed. Not much we can do but fail the I/O. + */ + if (!buffer_mapped(bh)) { + unlock_buffer(bh); + return -EIO; + } + get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(REQ_OP_WRITE, op_flags, bh); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 55ccccf77cea..034b3f4fdd3a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int have = ci->i_snap_caps; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s" - " (mask %s)\n", ci->vfs_inode.i_ino, + dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s" - " (mask %s)\n", ci->vfs_inode.i_ino, cap, + dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" 
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s" - " (mask %s)\n", ci->vfs_inode.i_ino, + dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want, struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); - cw.ino = inode->i_ino; + cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; cw.want = want; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 97539b497e4c..3e3fcda9b276 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) { struct seq_file *s = p; - seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino, + seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode), ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented)); return 0; @@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p) spin_lock(&mdsc->caps_list_lock); list_for_each_entry(cw, &mdsc->cap_wait_list, list) { - seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino, + seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino, ceph_cap_string(cw->need), ceph_cap_string(cw->want)); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 060bdcc5ce32..d72e4a12bb69 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, - 
d_inode(dentry)->i_ino), + dentry->d_name.len, ceph_present_inode(d_inode(dentry)), d_inode(dentry)->i_mode >> 12)) { dput(dentry); err = 0; @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* always start with . and .. */ if (ctx->pos == 0) { dout("readdir off 0 -> '.'\n"); - if (!dir_emit(ctx, ".", 1, - ceph_translate_ino(inode->i_sb, inode->i_ino), + if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode), inode->i_mode >> 12)) return 0; ctx->pos = 1; } if (ctx->pos == 1) { - ino_t ino = parent_ino(file->f_path.dentry); + u64 ino; + struct dentry *dentry = file->f_path.dentry; + + spin_lock(&dentry->d_lock); + ino = ceph_present_inode(dentry->d_parent->d_inode); + spin_unlock(&dentry->d_lock); + dout("readdir off 1 -> '..'\n"); - if (!dir_emit(ctx, "..", 2, - ceph_translate_ino(inode->i_sb, ino), - inode->i_mode >> 12)) + if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12)) return 0; ctx->pos = 2; } @@ -507,9 +508,6 @@ more: } for (; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_vino vino; - ino_t ino; - u32 ftype; BUG_ON(rde->offset < ctx->pos); @@ -519,13 +517,10 @@ more: rde->name_len, rde->name, &rde->inode.in); BUG_ON(!rde->inode.in); - ftype = le32_to_cpu(rde->inode.in->mode) >> 12; - vino.ino = le64_to_cpu(rde->inode.in->ino); - vino.snap = le64_to_cpu(rde->inode.in->snapid); - ino = ceph_vino_to_ino(vino); if (!dir_emit(ctx, rde->name, rde->name_len, - ceph_translate_ino(inode->i_sb, ino), ftype)) { + ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), + le32_to_cpu(rde->inode.in->mode) >> 12)) { dout("filldir stopping us...\n"); return 0; } @@ -1161,7 +1156,7 @@ retry: if (try_async && op == CEPH_MDS_OP_UNLINK && (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { - dout("async unlink on %lu/%.*s caps=%s", dir->i_ino, + dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), dentry->d_name.len, dentry->d_name.name, 
ceph_cap_string(req->r_dir_caps)); set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); @@ -1745,7 +1740,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) case -ENOENT: if (d_really_is_negative(dentry)) valid = 1; - /* Fallthrough */ + fallthrough; default: break; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d51c3f2fdca0..3f4c993dfc6f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -252,7 +252,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFREG: ceph_fscache_register_inode_cookie(inode); ceph_fscache_file_set_cookie(inode, file); - /* fall through */ + fallthrough; case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, S_ISDIR(inode->i_mode)); @@ -630,8 +630,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, } else { struct dentry *dn; - dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, - vino.ino, dir->i_ino, dentry->d_name.name); + dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, + vino.ino, ceph_ino(dir), dentry->d_name.name); ceph_dir_clear_ordered(dir); ceph_init_inode_acls(inode, as_ctx); if (inode->i_state & I_NEW) { @@ -2507,6 +2507,7 @@ const struct file_operations ceph_file_fops = { .mmap = ceph_mmap, .fsync = ceph_fsync, .lock = ceph_lock, + .setlease = simple_nosetlease, .flock = ceph_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 357c937699d5..d163fa96cb40 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work); */ static int ceph_set_ino_cb(struct inode *inode, void *data) { - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + struct ceph_inode_info *ci = ceph_inode(inode); + + ci->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); 
inode_set_iversion_raw(inode, 0); return 0; } @@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; - ino_t t = ceph_vino_to_ino(vino); - inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); + inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, + ceph_set_ino_cb, &vino); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) - dout("get_inode created new inode %p %llx.%llx ino %llx\n", - inode, ceph_vinop(inode), (u64)inode->i_ino); - dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, - vino.snap, inode); + dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), + ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); return inode; } @@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat, } generic_fillattr(inode, stat); - stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + stat->ino = ceph_present_inode(inode); /* * btime on newly-allocated inodes is 0, so if this is still set to diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index bc9e95937d7c..658800605bfb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode { struct cap_wait { struct list_head list; - unsigned long ino; + u64 ino; pid_t tgid; int need; int want; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 198ddde5c1e6..cc2c4d40b022 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct super_block *sb = mdsc->fsc->sb; + struct inode *root = d_inode(sb->s_root); if (atomic64_read(&mdsc->quotarealms_count) > 0) return true; /* if root is the real CephFS root, we don't have quota realms */ - if (sb->s_root->d_inode && - 
(sb->s_root->d_inode->i_ino == CEPH_INO_ROOT)) + if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; /* otherwise, we can't know for sure */ return true; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 4c3c964b1c54..a3995ebe0623 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode) return ceph_inode(inode)->i_vino; } -/* - * ino_t is <64 bits on many architectures, blech. - * - * i_ino (kernel inode) st_ino (userspace) - * i386 32 32 - * x86_64+ino32 64 32 - * x86_64 64 64 - */ -static inline u32 ceph_ino_to_ino32(__u64 vino) +static inline u32 ceph_ino_to_ino32(u64 vino) { u32 ino = vino & 0xffffffff; ino ^= vino >> 32; @@ -475,34 +467,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) } /* - * kernel i_ino value + * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on + * some arches. We generally do not use this value inside the ceph driver, but + * we do want to set it to something, so that generic vfs code has an + * appropriate value for tracepoints and the like. 
*/ -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino) { -#if BITS_PER_LONG == 32 - return ceph_ino_to_ino32(vino.ino); -#else + if (sizeof(ino_t) == sizeof(u32)) + return ceph_ino_to_ino32(vino.ino); return (ino_t)vino.ino; -#endif -} - -/* - * user-visible ino (stat, filldir) - */ -#if BITS_PER_LONG == 32 -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) -{ - return ino; -} -#else -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) -{ - if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) - ino = ceph_ino_to_ino32(ino); - return ino; } -#endif - /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode) { return ceph_inode(inode)->i_vino.ino; } + static inline u64 ceph_snap(struct inode *inode) { return ceph_inode(inode)->i_vino.snap; } +/** + * ceph_present_ino - format an inode number for presentation to userland + * @sb: superblock where the inode lives + * @ino: inode number to (possibly) convert + * + * If the user mounted with the ino32 option, then the 64-bit value needs + * to be converted to something that can fit inside 32 bits. Note that + * internal kernel code never uses this value, so this is entirely for + * userland consumption. 
+ */ +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) +{ + if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))) + return ceph_ino_to_ino32(ino); + return ino; +} + +static inline u64 ceph_present_inode(struct inode *inode) +{ + return ceph_present_ino(inode->i_sb, ceph_ino(inode)); +} + static inline int ceph_ino_compare(struct inode *inode, void *data) { struct ceph_vino *pvino = (struct ceph_vino *)data; @@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) ci->i_vino.snap == pvino->snap; } + static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { - ino_t t = ceph_vino_to_ino(vino); - return ilookup5(sb, t, ceph_ino_compare, &vino); + /* + * NB: The hashval will be run through the fs/inode.c hash function + * anyway, so there is no need to squash the inode number down to + * 32-bits first. Just use low-order bits on arches with 32-bit long. + */ + return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b296964b8afa..b565d83ba89e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -2031,4 +2031,19 @@ static inline bool is_smb1_server(struct TCP_Server_Info *server) return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0; } +static inline bool is_tcon_dfs(struct cifs_tcon *tcon) +{ + /* + * For SMB1, see MS-CIFS 2.4.55 SMB_COM_TREE_CONNECT_ANDX (0x75) and MS-CIFS 3.3.4.4 DFS + * Subsystem Notifies That a Share Is a DFS Share. + * + * For SMB2+, see MS-SMB2 2.2.10 SMB2 TREE_CONNECT Response and MS-SMB2 3.3.4.14 Server + * Application Updates a Share. + */ + if (!tcon || !tcon->ses || !tcon->ses->server) + return false; + return is_smb1_server(tcon->ses->server) ? 
tcon->Flags & SMB_SHARE_IS_IN_DFS : + tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT); +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0e763d2dcf16..0496934feecb 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -581,7 +581,7 @@ should_set_ext_sec_flag(enum securityEnum sectype) if (global_secflags & (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)) return true; - /* Fallthrough */ + fallthrough; default: return false; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a275ee399dce..a5731dd6e656 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1378,25 +1378,25 @@ static int cifs_parse_security_flavors(char *value, return 1; case Opt_sec_krb5i: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_sec_krb5: vol->sectype = Kerberos; break; case Opt_sec_ntlmsspi: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_sec_ntlmssp: vol->sectype = RawNTLMSSP; break; case Opt_sec_ntlmi: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_ntlm: vol->sectype = NTLM; break; case Opt_sec_ntlmv2i: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_sec_ntlmv2: vol->sectype = NTLMv2; break; @@ -2187,7 +2187,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->password = NULL; break; } - /* Fallthrough - to Opt_pass below.*/ + fallthrough; /* to Opt_pass below */ case Opt_pass: /* Obtain the value string */ value = strchr(data, '='); @@ -4909,7 +4909,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) if (!tcon) continue; /* Make sure that requests go through new root servers */ - if (tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT)) { + if (is_tcon_dfs(tcon)) { put_root_ses(root_ses); set_root_ses(cifs_sb, ses, &root_ses); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3989d08396ac..1f75b25e559a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1017,6 +1017,8 @@ handle_mnt_opt: if 
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true, full_path, fid); + if (rc == -EREMOTE) + rc = 0; if (rc) { cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n", __func__, rc); @@ -1025,6 +1027,8 @@ handle_mnt_opt: } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, full_path, fid); + if (rc == -EREMOTE) + rc = 0; if (rc) { cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", __func__, rc); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 69cd5856621b..de564368a887 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -798,7 +798,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if ((server->sec_kerberos || server->sec_mskerberos) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; - /* Fallthrough */ + fallthrough; default: return Unspecified; } @@ -815,7 +815,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) default: break; } - /* Fallthrough - to attempt LANMAN authentication next */ + fallthrough; /* to attempt LANMAN authentication next */ case CIFS_NEGFLAVOR_LANMAN: switch (requested) { case LANMAN: @@ -823,7 +823,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) case Unspecified: if (global_secflags & CIFSSEC_MAY_LANMAN) return LANMAN; - /* Fallthrough */ + fallthrough; default: return Unspecified; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 32f90dc82c84..d44df8f95bcd 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = si_iov; rqst[1].rq_nvec = 1; - len = sizeof(ea) + ea_name_len + ea_value_len + 1; + len = sizeof(*ea) + ea_name_len + ea_value_len + 1; ea = kzalloc(len, GFP_KERNEL); if (ea == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 667d70aa335f..96c172d94fba 100644 
--- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1101,7 +1101,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if ((server->sec_kerberos || server->sec_mskerberos) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; - /* Fallthrough */ + fallthrough; default: return Unspecified; } diff --git a/fs/compat.c b/fs/compat.c deleted file mode 100644 index 436d228cf71c..000000000000 --- a/fs/compat.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/compat.c - * - * Kernel compatibililty routines for e.g. 32 bit syscall support - * on 64 bit kernels. - * - * Copyright (C) 2002 Stephen Rothwell, IBM Corporation - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - */ - -#include <linux/compat.h> -#include <linux/nfs4_mount.h> -#include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include "internal.h" - -struct compat_nfs_string { - compat_uint_t len; - compat_uptr_t data; -}; - -static inline void compat_nfs_string(struct nfs_string *dst, - struct compat_nfs_string *src) -{ - dst->data = compat_ptr(src->data); - dst->len = src->len; -} - -struct compat_nfs4_mount_data_v1 { - compat_int_t version; - compat_int_t flags; - compat_int_t rsize; - compat_int_t wsize; - compat_int_t timeo; - compat_int_t retrans; - compat_int_t acregmin; - compat_int_t acregmax; - compat_int_t acdirmin; - compat_int_t acdirmax; - struct compat_nfs_string client_addr; - struct compat_nfs_string mnt_path; - struct compat_nfs_string hostname; - compat_uint_t host_addrlen; - compat_uptr_t host_addr; - compat_int_t proto; - compat_int_t auth_flavourlen; - compat_uptr_t auth_flavours; -}; - -static int do_nfs4_super_data_conv(void *raw_data) -{ - int version = *(compat_uint_t *) raw_data; - - if (version == 1) { - struct 
compat_nfs4_mount_data_v1 *raw = raw_data; - struct nfs4_mount_data *real = raw_data; - - /* copy the fields backwards */ - real->auth_flavours = compat_ptr(raw->auth_flavours); - real->auth_flavourlen = raw->auth_flavourlen; - real->proto = raw->proto; - real->host_addr = compat_ptr(raw->host_addr); - real->host_addrlen = raw->host_addrlen; - compat_nfs_string(&real->hostname, &raw->hostname); - compat_nfs_string(&real->mnt_path, &raw->mnt_path); - compat_nfs_string(&real->client_addr, &raw->client_addr); - real->acdirmax = raw->acdirmax; - real->acdirmin = raw->acdirmin; - real->acregmax = raw->acregmax; - real->acregmin = raw->acregmin; - real->retrans = raw->retrans; - real->timeo = raw->timeo; - real->wsize = raw->wsize; - real->rsize = raw->rsize; - real->flags = raw->flags; - real->version = raw->version; - } - - return 0; -} - -#define NFS4_NAME "nfs4" - -COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, - const char __user *, dir_name, - const char __user *, type, compat_ulong_t, flags, - const void __user *, data) -{ - char *kernel_type; - void *options; - char *kernel_dev; - int retval; - - kernel_type = copy_mount_string(type); - retval = PTR_ERR(kernel_type); - if (IS_ERR(kernel_type)) - goto out; - - kernel_dev = copy_mount_string(dev_name); - retval = PTR_ERR(kernel_dev); - if (IS_ERR(kernel_dev)) - goto out1; - - options = copy_mount_options(data); - retval = PTR_ERR(options); - if (IS_ERR(options)) - goto out2; - - if (kernel_type && options) { - if (!strcmp(kernel_type, NFS4_NAME)) { - retval = -EINVAL; - if (do_nfs4_super_data_conv(options)) - goto out3; - } - } - - retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options); - - out3: - kfree(options); - out2: - kfree(kernel_dev); - out1: - kfree(kernel_type); - out: - return retval; -} diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cb733652ecca..ca2273727225 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1688,11 +1688,11 @@ static loff_t 
configfs_dir_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case 1: offset += file->f_pos; - /* fall through */ + fallthrough; case 0: if (offset >= 0) break; - /* fall through */ + fallthrough; default: return -EINVAL; } diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 9212325763b0..4ef3f714046a 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - if (inode) + if (inode && inode->i_ino) printk("%sfscrypt (%s, inode %lu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); + else if (inode) + printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf); else printk("%sfscrypt: %pV\n", level, &vaf); va_end(args); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 011830f84d8d..1fbe6c24d705 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -61,15 +61,6 @@ struct fscrypt_nokey_name { */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) -static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) -{ - struct sha256_state sctx; - - sha256_init(&sctx); - sha256_update(&sctx, data, data_len); - sha256_final(&sctx, result); -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst) return cp - dst; } -bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, - u32 max_len, u32 *encrypted_len_ret) +bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { - const struct fscrypt_info *ci = inode->i_crypt_info; - int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) & + int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; @@ -260,8 +251,6 @@ bool 
fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames - * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present * @crypto_str: (output) buffer to allocate @@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, * * Return: 0 on success, -errno on failure */ -int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 max_encrypted_len, +int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); @@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. */ - fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], - iname->len - sizeof(nokey_name.bytes), - nokey_name.sha256); + sha256(&iname->name[sizeof(nokey_name.bytes)], + iname->len - sizeof(nokey_name.bytes), + nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); @@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * - * Else, for keyless @lookup operations, @iname is the presented ciphertext, so - * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be - * impossible in this case, so we fail them with ENOKEY. + * Else, for keyless @lookup operations, @iname should be a no-key name, so we + * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will + * be impossible in this case, so we fail them with ENOKEY. 
* * If successful, fscrypt_free_filename() must be called later to clean up. * @@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(dir, iname->len, + if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, + iname->len, dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; @@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, } if (!lookup) return -ENOKEY; - fname->is_ciphertext_name = true; + fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the @@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, { const struct fscrypt_nokey_name *nokey_name = (const void *)fname->crypto_buf.name; - u8 sha256[SHA256_DIGEST_SIZE]; + u8 digest[SHA256_DIGEST_SIZE]; if (likely(fname->disk_name.name)) { if (de_name_len != fname->disk_name.len) @@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; - fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], - de_name_len - sizeof(nokey_name->bytes), sha256); - return !memcmp(sha256, nokey_name->sha256, sizeof(sha256)); + sha256(&de_name[sizeof(nokey_name->bytes)], + de_name_len - sizeof(nokey_name->bytes), digest); + return !memcmp(digest, nokey_name->sha256, sizeof(digest)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); @@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. 
*/ -static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; int err; @@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) /* * Plaintext names are always valid, since fscrypt doesn't support - * reverting to ciphertext names without evicting the directory's inode + * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ - if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* - * Ciphertext name; valid if the directory's key is still unavailable. + * No-key name; valid if the directory's key is still unavailable. * - * Although fscrypt forbids rename() on ciphertext names, we still must - * use dget_parent() here rather than use ->d_parent directly. That's + * Although fscrypt forbids rename() on no-key names, we still must use + * dget_parent() here rather than use ->d_parent directly. 
That's * because a corrupted fs image may contain directory hard links, which * the VFS handles by moving the directory's dentry tree in the dcache * each time ->lookup() finds the directory and it already has a dentry @@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } +EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8117a61b6f55..4f5806a3b73d 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) return NULL; } -#undef fscrypt_policy union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; @@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, - u32 max_len, u32 *encrypted_len_ret); +bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); +void fscrypt_hash_inode_number(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); + /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); @@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); +const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); 
#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 09fb8aa0f2e9..20b0df47fe6a 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, if (err) return err; - /* ... in case we looked up ciphertext name before key was added */ - if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + /* ... in case we looked up no-key name before key was added */ + if (dentry->d_flags & DCACHE_NOKEY_NAME) return -ENOKEY; if (!fscrypt_has_permitted_context(dir, inode)) @@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) return err; - /* ... in case we looked up ciphertext name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & - DCACHE_ENCRYPTED_NAME) + /* ... in case we looked up no-key name(s) before key was added */ + if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME) return -ENOKEY; if (old_dir != new_dir) { @@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_ciphertext_name) { + if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_NAME; + dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); d_set_d_op(dentry, &fscrypt_d_ops); } @@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode, return 0; } -int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, - unsigned int max_len, - struct fscrypt_str *disk_link) +/** + * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink + * @dir: directory in which the symlink is being created + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @max_len: space the filesystem has available to store the symlink target + * @disk_link: (out) the on-disk symlink target being prepared + * + * This function 
computes the size the symlink target will require on-disk, + * stores it in @disk_link->len, and validates it against @max_len. An + * encrypted symlink may be longer than the original. + * + * Additionally, @disk_link->name is set to @target if the symlink will be + * unencrypted, but left NULL if the symlink will be encrypted. For encrypted + * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the + * on-disk target later. (The reason for the two-step process is that some + * filesystems need to know the size of the symlink target before creating the + * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) + * + * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, + * -ENOKEY if the encryption key is missing, or another -errno code if a problem + * occurred while setting up the encryption key. + */ +int fscrypt_prepare_symlink(struct inode *dir, const char *target, + unsigned int len, unsigned int max_len, + struct fscrypt_str *disk_link) { - int err; + const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. - * The easiest way to get access to this is to just load the directory's - * fscrypt_info, since we'll need it to create the dir_entry anyway. - * - * Note: in test_dummy_encryption mode, @dir may be unencrypted. 
*/ - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; + policy = fscrypt_policy_to_inherit(dir); + if (policy == NULL) { + /* Not encrypted */ + disk_link->name = (unsigned char *)target; + disk_link->len = len + 1; + if (disk_link->len > max_len) + return -ENAMETOOLONG; + return 0; + } + if (IS_ERR(policy)) + return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't @@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ - if (!fscrypt_fname_encrypted_size(dir, len, + if (!fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data), &disk_link->len)) return -ENAMETOOLONG; @@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, disk_link->name = NULL; return 0; } -EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink); +EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) @@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; - err = fscrypt_require_key(inode); - if (err) - return err; + /* + * fscrypt_prepare_new_inode() should have already set up the new + * symlink inode's encryption key. We don't wait until now to do it, + * since we may be in a filesystem transaction now. 
+ */ + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) + return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ @@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, if (cstr.len + sizeof(*sd) - 1 > max_size) return ERR_PTR(-EUCLEAN); - err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index faa25541ccb6..89bffa82ed74 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS); + devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); if (!devs) return -ENOMEM; fscrypt_get_devices(sb, num_devs, devs); @@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, struct fscrypt_blk_crypto_key *blk_key; int err; int i; - unsigned int flags; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS); + blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); if (!blk_key) return -ENOMEM; @@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } queue_refs++; - flags = memalloc_nofs_save(); err = blk_crypto_start_using_key(&blk_key->base, blk_key->devs[i]); - memalloc_nofs_restore(flags); if (err) { fscrypt_err(inode, "error %d starting to use blk-crypto", err); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index e74f239c4428..53cc552a7b8f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb, struct list_head *pos; size_t busy_count = 0; unsigned long ino; + char ino_str[50] = ""; 
spin_lock(&mk->mk_decrypted_inodes_lock); @@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb, } spin_unlock(&mk->mk_decrypted_inodes_lock); + /* If the inode is currently being created, ino may still be 0. */ + if (ino) + snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); + fscrypt_warn(NULL, - "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu", + "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, - ino); + ino_str); return -EBUSY; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index fea6226afc2b..d3c3e5d9b41f 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -10,6 +10,7 @@ #include <crypto/skcipher.h> #include <linux/key.h> +#include <linux/random.h> #include "fscrypt_private.h" @@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +void fscrypt_hash_inode_number(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk) +{ + WARN_ON(ci->ci_inode->i_ino == 0); + WARN_ON(!mk->mk_ino_hash_key_initialized); + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); +} + static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -254,13 +265,20 @@ unlock: return err; } - ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, - &mk->mk_ino_hash_key); + /* + * New inodes may not have an inode number assigned yet. + * Hashing their inode number is delayed until later. 
+ */ + if (ci->ci_inode->i_ino == 0) + WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); + else + fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, - struct fscrypt_master_key *mk) + struct fscrypt_master_key *mk, + bool need_dirhash_key) { int err; @@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, return err; /* Derive a secret dirhash key for directories that need it. */ - if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) { + if (need_dirhash_key) { err = fscrypt_derive_dirhash_key(ci, mk); if (err) return err; @@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * key being removed with a new inode starting to use it. */ static int setup_file_encryption_key(struct fscrypt_info *ci, + bool need_dirhash_key, struct key **master_key_ret) { struct key *key; @@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); break; case FSCRYPT_POLICY_V2: - err = fscrypt_setup_v2_file_key(ci, mk); + err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON(1); @@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int fscrypt_get_encryption_info(struct inode *inode) +static int +fscrypt_setup_encryption_info(struct inode *inode, + const union fscrypt_policy *policy, + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], + bool need_dirhash_key) { struct fscrypt_info *crypt_info; - union fscrypt_context ctx; struct fscrypt_mode *mode; struct key *master_key = NULL; int res; - if (fscrypt_has_encryption_key(inode)) - return 0; - res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0) { - const union fscrypt_context *dummy_ctx = - fscrypt_get_dummy_context(inode->i_sb); - - if 
(IS_ENCRYPTED(inode) || !dummy_ctx) { - fscrypt_warn(inode, - "Error %d getting encryption context", - res); - return res; - } - /* Fake up a context for an unencrypted directory */ - res = fscrypt_context_size(dummy_ctx); - memcpy(&ctx, dummy_ctx, res); - } - - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); + crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; - - res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); - if (res) { - fscrypt_warn(inode, - "Unrecognized or corrupt encryption context"); - goto out; - } - - memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx), - FSCRYPT_FILE_NONCE_SIZE); - - if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { - res = -EINVAL; - goto out; - } + crypt_info->ci_policy = *policy; + memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { @@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode) WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; - res = setup_file_encryption_key(crypt_info, &master_key); + res = setup_file_encryption_key(crypt_info, need_dirhash_key, + &master_key); if (res) goto out; /* - * Multiple tasks may race to set ->i_crypt_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in + * For existing inodes, multiple tasks may race to set ->i_crypt_info. + * So use cmpxchg_release(). This pairs with the smp_load_acquire() in * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a * RELEASE barrier so that other tasks can ACQUIRE it. */ @@ -550,14 +541,113 @@ out: up_read(&mk->mk_secret_sem); key_put(master_key); } + put_crypt_info(crypt_info); + return res; +} + +/** + * fscrypt_get_encryption_info() - set up an inode's encryption key + * @inode: the inode to set up the key for. Must be encrypted. 
+ * + * Set up ->i_crypt_info, if it hasn't already been done. + * + * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * generally this shouldn't be called from within a filesystem transaction. + * + * Return: 0 if ->i_crypt_info was set or was already set, *or* if the + * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * distinguish these cases.) Also can return another -errno code. + */ +int fscrypt_get_encryption_info(struct inode *inode) +{ + int res; + union fscrypt_context ctx; + union fscrypt_policy policy; + + if (fscrypt_has_encryption_key(inode)) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + fscrypt_warn(inode, "Error %d getting encryption context", res); + return res; + } + + res = fscrypt_policy_from_context(&policy, &ctx, res); + if (res) { + fscrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + return res; + } + + if (!fscrypt_supported_policy(&policy, inode)) + return -EINVAL; + + res = fscrypt_setup_encryption_info(inode, &policy, + fscrypt_context_nonce(&ctx), + IS_CASEFOLDED(inode) && + S_ISDIR(inode->i_mode)); if (res == -ENOKEY) res = 0; - put_crypt_info(crypt_info); return res; } EXPORT_SYMBOL(fscrypt_get_encryption_info); /** + * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory + * @dir: a possibly-encrypted directory + * @inode: the new inode. ->i_mode must be set already. + * ->i_ino doesn't need to be set yet. + * @encrypt_ret: (output) set to %true if the new inode will be encrypted + * + * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * encrypting the name of the new file. Also, if the new inode will be + * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * + * This isn't %GFP_NOFS-safe, and therefore it should be called before starting + * any filesystem transaction to create the inode. 
For this reason, ->i_ino + * isn't required to be set yet, as the filesystem may not have set it yet. + * + * This doesn't persist the new inode's encryption context. That still needs to + * be done later by calling fscrypt_set_context(). + * + * Return: 0 on success, -ENOKEY if the encryption key is missing, or another + * -errno code + */ +int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, + bool *encrypt_ret) +{ + const union fscrypt_policy *policy; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; + + policy = fscrypt_policy_to_inherit(dir); + if (policy == NULL) + return 0; + if (IS_ERR(policy)) + return PTR_ERR(policy); + + if (WARN_ON_ONCE(inode->i_mode == 0)) + return -EINVAL; + + /* + * Only regular files, directories, and symlinks are encrypted. + * Special files like device nodes and named pipes aren't. + */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return 0; + + *encrypt_ret = true; + + get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); + return fscrypt_setup_encryption_info(inode, policy, nonce, + IS_CASEFOLDED(dir) && + S_ISDIR(inode->i_mode)); +} +EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); + +/** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index a3cb52572b05..2762c5350432 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key, goto out; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_KERNEL); if (!req) { res = -ENOMEM; goto out; @@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix, const struct user_key_payload *ukp; const struct fscrypt_key *payload; - description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + description = kasprintf(GFP_KERNEL, "%s%*phN", prefix, 
FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) return ERR_PTR(-ENOMEM); @@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) return dk; /* Nope, allocate one. */ - dk = kzalloc(sizeof(*dk), GFP_NOFS); + dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); refcount_set(&dk->dk_refcount, 1); @@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci, * This cannot be a stack buffer because it will be passed to the * scatterlist crypto API during derive_key_aes(). */ - derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL); if (!derived_key) return -ENOMEM; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 2d73fd39ad96..4441d9944b9e 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +static const union fscrypt_policy * +fscrypt_get_dummy_policy(struct super_block *sb) +{ + if (!sb->s_cop->get_dummy_policy) + return NULL; + return sb->s_cop->get_dummy_policy(sb); +} + static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && @@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, 32, 32)) return false; + /* + * IV_INO_LBLK_32 hashes the inode number, so in principle it can + * support any ino_bits. However, currently the inode number is gotten + * from inode::i_ino which is 'unsigned long'. So for now the + * implementation limit is 32 bits. + */ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && - /* This uses hashed inode numbers, so ino_bits doesn't matter. 
*/ !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", - INT_MAX, 32)) + 32, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy() - create a new fscrypt_context from - * an fscrypt_policy + * fscrypt_new_context() - create a new fscrypt_context * @ctx_u: output context * @policy_u: input policy + * @nonce: nonce to use * * Create an fscrypt_context for an inode that is being assigned the given - * encryption policy. A new nonce is randomly generated. + * encryption policy. @nonce must be a new random nonce. * * Return: the size of the new context in bytes. */ -static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, - const union fscrypt_policy *policy_u) +static int fscrypt_new_context(union fscrypt_context *ctx_u, + const union fscrypt_policy *policy_u, + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE]) { memset(ctx_u, 0, sizeof(*ctx_u)); @@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, memcpy(ctx->master_key_descriptor, policy->master_key_descriptor, sizeof(ctx->master_key_descriptor)); - get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } case FSCRYPT_POLICY_V2: { @@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); - get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } } @@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) static int set_encryption_policy(struct inode *inode, const union fscrypt_policy *policy) { + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; union fscrypt_context ctx; int ctxsize; int err; 
@@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode, return -EINVAL; } - ctxsize = fscrypt_new_context_from_policy(&ctx, policy); + get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); + ctxsize = fscrypt_new_context(&ctx, policy, nonce); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } @@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) } EXPORT_SYMBOL(fscrypt_has_permitted_context); +/* + * Return the encryption policy that new files in the directory will inherit, or + * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also + * ensure that its key is set up, so that the new filename can be encrypted. + */ +const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) +{ + int err; + + if (IS_ENCRYPTED(dir)) { + err = fscrypt_require_key(dir); + if (err) + return ERR_PTR(err); + return &dir->i_crypt_info->ci_policy; + } + + return fscrypt_get_dummy_policy(dir->i_sb); +} + /** - * fscrypt_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. - * @fs_data: private data given by FS. - * @preload: preload child i_crypt_info if true + * fscrypt_set_context() - Set the fscrypt context of a new inode + * @inode: a new inode + * @fs_data: private data given by FS and passed to ->set_context() + * + * This should be called after fscrypt_prepare_new_inode(), generally during a + * filesystem transaction. Everything here must be %GFP_NOFS-safe. 
* * Return: 0 on success, -errno on failure */ -int fscrypt_inherit_context(struct inode *parent, struct inode *child, - void *fs_data, bool preload) +int fscrypt_set_context(struct inode *inode, void *fs_data) { + struct fscrypt_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; - struct fscrypt_info *ci; - int res; - - res = fscrypt_get_encryption_info(parent); - if (res < 0) - return res; - ci = fscrypt_get_info(parent); - if (ci == NULL) + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) return -ENOKEY; - ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy); - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data); - if (res) - return res; - return preload ? fscrypt_get_encryption_info(child): 0; + ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + + /* + * This may be the first time the inode number is available, so do any + * delayed key setup that requires the inode number. + */ + if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && + (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { + const struct fscrypt_master_key *mk = + ci->ci_master_key->payload.data[0]; + + fscrypt_hash_inode_number(ci, mk); + } + + return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } -EXPORT_SYMBOL(fscrypt_inherit_context); +EXPORT_SYMBOL_GPL(fscrypt_set_context); /** * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' * @sb: the filesystem on which test_dummy_encryption is being specified - * @arg: the argument to the test_dummy_encryption option. - * If no argument was specified, then @arg->from == NULL. - * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * @arg: the argument to the test_dummy_encryption option. May be NULL. 
+ * @dummy_policy: the filesystem's current dummy policy (input/output, see + * below) * * Handle the test_dummy_encryption mount option by creating a dummy encryption - * context, saving it in @dummy_ctx, and adding the corresponding dummy - * encryption key to the filesystem. If the @dummy_ctx is already set, then + * policy, saving it in @dummy_policy, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_policy is already set, then * instead validate that it matches @arg. Don't support changing it via * remount, as that is difficult to do safely. * - * The reason we use an fscrypt_context rather than an fscrypt_policy is because - * we mustn't generate a new nonce each time we access a dummy-encrypted - * directory, as that would change the way filenames are encrypted. - * - * Return: 0 on success (dummy context set, or the same context is already set); - * -EEXIST if a different dummy context is already set; + * Return: 0 on success (dummy policy set, or the same policy is already set); + * -EEXIST if a different dummy policy is already set; * or another -errno value. 
*/ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, - const substring_t *arg, - struct fscrypt_dummy_context *dummy_ctx) +int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, + struct fscrypt_dummy_policy *dummy_policy) { - const char *argstr = "v2"; - const char *argstr_to_free = NULL; struct fscrypt_key_specifier key_spec = { 0 }; int version; - union fscrypt_context *ctx = NULL; + union fscrypt_policy *policy = NULL; int err; - if (arg->from) { - argstr = argstr_to_free = match_strdup(arg); - if (!argstr) - return -ENOMEM; - } + if (!arg) + arg = "v2"; - if (!strcmp(argstr, "v1")) { - version = FSCRYPT_CONTEXT_V1; + if (!strcmp(arg, "v1")) { + version = FSCRYPT_POLICY_V1; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; memset(key_spec.u.descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); - } else if (!strcmp(argstr, "v2")) { - version = FSCRYPT_CONTEXT_V2; + } else if (!strcmp(arg, "v2")) { + version = FSCRYPT_POLICY_V2; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; /* key_spec.u.identifier gets filled in when adding the key */ } else { @@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, goto out; } - if (dummy_ctx->ctx) { - /* - * Note: if we ever make test_dummy_encryption support - * specifying other encryption settings, such as the encryption - * modes, we'll need to compare those settings here. 
- */ - if (dummy_ctx->ctx->version == version) - err = 0; - else - err = -EEXIST; - goto out; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { + policy = kzalloc(sizeof(*policy), GFP_KERNEL); + if (!policy) { err = -ENOMEM; goto out; } @@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, if (err) goto out; - ctx->version = version; - switch (ctx->version) { - case FSCRYPT_CONTEXT_V1: - ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + policy->version = version; + switch (policy->version) { + case FSCRYPT_POLICY_V1: + policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); break; - case FSCRYPT_CONTEXT_V2: - ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + case FSCRYPT_POLICY_V2: + policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); break; default: @@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, err = -EINVAL; goto out; } - dummy_ctx->ctx = ctx; - ctx = NULL; + + if (dummy_policy->policy) { + if (fscrypt_policies_equal(policy, dummy_policy->policy)) + err = 0; + else + err = -EEXIST; + goto out; + } + dummy_policy->policy = policy; + policy = NULL; err = 0; out: - kfree(ctx); - kfree(argstr_to_free); + kfree(policy); return err; } EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); @@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); void 
fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { - const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb); + int vers; - if (!ctx) + if (!policy) return; - seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); + + vers = policy->version; + if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */ + vers = 1; + + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers); } EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/d_path.c b/fs/d_path.c index 0f1fc1743302..a69e2cd36e6e 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -102,6 +102,8 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = READ_ONCE(mnt->mnt_parent); + struct mnt_namespace *mnt_ns; + /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -116,7 +118,9 @@ restart: vfsmnt = &mnt->mnt; continue; } - if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns)) + mnt_ns = READ_ONCE(mnt->mnt_ns); + /* open-coded is_mounted() to use local mnt_ns */ + if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) error = 1; // absolute root else error = 2; // detached or not attached yet @@ -1037,18 +1037,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } -int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, - struct iomap *iomap) +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); pgoff_t pgoff; long rc, id; void *kaddr; bool page_aligned = false; - + unsigned offset = offset_in_page(pos); + unsigned size = min_t(u64, PAGE_SIZE - offset, length); if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && - IS_ALIGNED(size, PAGE_SIZE)) + (size == PAGE_SIZE)) page_aligned = true; rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); @@ -1058,8 +1058,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, id = 
dax_read_lock(); if (page_aligned) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, - size >> PAGE_SHIFT); + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { @@ -1072,7 +1071,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, dax_flush(iomap->dax_dev, kaddr + offset, size); } dax_read_unlock(id); - return 0; + return size; } static loff_t @@ -1367,7 +1366,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret = dax_load_hole(&xas, mapping, &entry, vmf); goto finish_iomap; } - /*FALLTHRU*/ + fallthrough; default: WARN_ON_ONCE(1); error = -EIO; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index b167d2d02148..a768a09430c3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -177,7 +177,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; if (!fops_get(real_fops)) { -#ifdef MODULE +#ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) goto out; @@ -312,7 +312,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; if (!fops_get(real_fops)) { -#ifdef MODULE +#ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) goto out; diff --git a/fs/direct-io.c b/fs/direct-io.c index 183299892465..abf535b036ab 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. 
- */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..ee92634196a8 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,6 +4,7 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP + select SRCU help A general purpose distributed lock manager for kernel or userspace applications. diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 47f0b98b707f..49c5f9407098 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, int check_zero, + int *info_field, bool (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_zero && !x) + if (check_cb && check_cb(x)) return -EINVAL; *cl_field = x; @@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, return len; } -#define CLUSTER_ATTR(name, check_zero) \ +#define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ struct dlm_cluster *cl = config_item_to_cluster(item); \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ - check_zero, buf, len); \ + check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ @@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); 
-CLUSTER_ATTR(tcp_port, 1); -CLUSTER_ATTR(buffer_size, 1); -CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(recover_timer, 1); -CLUSTER_ATTR(toss_secs, 1); -CLUSTER_ATTR(scan_secs, 1); -CLUSTER_ATTR(log_debug, 0); -CLUSTER_ATTR(log_info, 0); -CLUSTER_ATTR(protocol, 0); -CLUSTER_ATTR(mark, 0); -CLUSTER_ATTR(timewarn_cs, 1); -CLUSTER_ATTR(waitwarn_us, 0); -CLUSTER_ATTR(new_rsb_count, 0); -CLUSTER_ATTR(recover_callbacks, 0); +static bool dlm_check_zero(unsigned int x) +{ + return !x; +} + +static bool dlm_check_buffer_size(unsigned int x) +{ + return (x < DEFAULT_BUFFER_SIZE); +} + +CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); +CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); +CLUSTER_ATTR(recover_timer, dlm_check_zero); +CLUSTER_ATTR(toss_secs, dlm_check_zero); +CLUSTER_ATTR(scan_secs, dlm_check_zero); +CLUSTER_ATTR(log_debug, NULL); +CLUSTER_ATTR(log_info, NULL); +CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(mark, NULL); +CLUSTER_ATTR(timewarn_cs, dlm_check_zero); +CLUSTER_ATTR(waitwarn_us, NULL); +CLUSTER_ATTR(new_rsb_count, NULL); +CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, @@ -221,6 +231,7 @@ struct dlm_space { struct list_head members; struct mutex members_lock; int members_count; + struct dlm_nodes *nds; }; struct dlm_comms { @@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; + sp->nds = nds; return &sp->group; fail: @@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); + kfree(sp->nds); kfree(sp); } @@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -int dlm_comm_mark(int nodeid, unsigned int *mark) +void dlm_comm_mark(int nodeid, 
unsigned int *mark) { struct dlm_comm *cm; cm = get_comm(nodeid); - if (!cm) - return -ENOENT; + if (!cm) { + *mark = dlm_config.ci_mark; + return; + } - *mark = cm->mark; - put_comm(cm); + if (cm->mark) + *mark = cm->mark; + else + *mark = dlm_config.ci_mark; - return 0; + put_comm(cm); } int dlm_our_nodeid(void) @@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 -#define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f62996cad561..c210250a2581 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,6 +12,8 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +#define DEFAULT_BUFFER_SIZE 4096 + struct dlm_config_node { int nodeid; int weight; @@ -46,7 +48,7 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -int dlm_comm_mark(int nodeid, unsigned int *mark); +void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 18d81599522f..002123efc6b0 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5817,7 +5817,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, break; case -EAGAIN: error = 0; - /* fall through */ + fallthrough; default: __put_lkb(ls, lkb); goto out; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5050fe05769b..79f56f16bc2c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,40 +65,6 @@ #define MAX_SEND_MSG_COUNT 25 #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) -struct cbuf { - unsigned int base; - unsigned int len; - unsigned int mask; -}; - -static void cbuf_add(struct cbuf *cb, int n) -{ - cb->len += n; -} - -static int cbuf_data(struct cbuf *cb) -{ - return 
((cb->base + cb->len) & cb->mask); -} - -static void cbuf_init(struct cbuf *cb, int size) -{ - cb->base = cb->len = 0; - cb->mask = size-1; -} - -static void cbuf_eat(struct cbuf *cb, int n) -{ - cb->len -= n; - cb->base += n; - cb->base &= cb->mask; -} - -static bool cbuf_empty(struct cbuf *cb) -{ - return cb->len == 0; -} - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ @@ -117,8 +83,6 @@ struct connection { int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - struct page *rx_page; - struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -126,6 +90,10 @@ struct connection { struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ + unsigned char *rx_buf; + int rx_buflen; + int rx_leftover; + struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; -static DEFINE_MUTEX(connections_lock); -static struct kmem_cache *con_cache; +static DEFINE_SPINLOCK(connections_lock); +DEFINE_STATIC_SRCU(connections_srcu); static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); @@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { - int r; + int r, idx; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) + idx = srcu_read_lock(&connections_srcu); + hlist_for_each_entry_rcu(con, &connection_hash[r], 
list) { + if (con->nodeid == nodeid) { + srcu_read_unlock(&connections_srcu, idx); return con; + } } + srcu_read_unlock(&connections_srcu, idx); + return NULL; } @@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid) * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ -static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +static struct connection *nodeid2con(int nodeid, gfp_t alloc) { - struct connection *con = NULL; + struct connection *con, *tmp; int r; con = __find_con(nodeid); if (con || !alloc) return con; - con = kmem_cache_zalloc(con_cache, alloc); + con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; - r = nodeid_hash(nodeid); - hlist_add_head(&con->list, &connection_hash[r]); + con->rx_buflen = dlm_config.ci_buffer_size; + con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); + if (!con->rx_buf) { + kfree(con); + return NULL; + } con->nodeid = nodeid; mutex_init(&con->sock_mutex); @@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) con->rx_action = zerocon->rx_action; } + r = nodeid_hash(nodeid); + + spin_lock(&connections_lock); + /* Because multiple workqueues/threads calls this function it can + * race on multiple cpu's. Instead of locking hot path __find_con() + * we just check in rare cases of recently added nodes again + * under protection of connections_lock. If this is the case we + * abort our connection creation and return the existing connection. 
+ */ + tmp = __find_con(nodeid); + if (tmp) { + spin_unlock(&connections_lock); + kfree(con->rx_buf); + kfree(con); + return tmp; + } + + hlist_add_head_rcu(&con->list, &connection_hash[r]); + spin_unlock(&connections_lock); + return con; } /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i; - struct hlist_node *n; + int i, idx; struct connection *con; + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, n, &connection_hash[i], list) + hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } -} - -static struct connection *nodeid2con(int nodeid, gfp_t allocation) -{ - struct connection *con; - - mutex_lock(&connections_lock); - con = __nodeid2con(nodeid, allocation); - mutex_unlock(&connections_lock); - - return con; + srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other, /* Will only re-enter once. 
*/ close_connection(con->othercon, false, true, true); } - if (con->rx_page) { - __free_page(con->rx_page); - con->rx_page = NULL; - } + con->rx_leftover = 0; con->retries = 0; mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); @@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con) shutdown_connection(con); } +static int con_realloc_receive_buf(struct connection *con, int newlen) +{ + unsigned char *newbuf; + + newbuf = kmalloc(newlen, GFP_NOFS); + if (!newbuf) + return -ENOMEM; + + /* copy any leftover from last receive */ + if (con->rx_leftover) + memmove(newbuf, con->rx_buf, con->rx_leftover); + + /* swap to new buffer space */ + kfree(con->rx_buf); + con->rx_buflen = newlen; + con->rx_buf = newbuf; + + return 0; +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int ret = 0; - struct msghdr msg = {}; - struct kvec iov[2]; - unsigned len; - int r; int call_again_soon = 0; - int nvec; + struct msghdr msg; + struct kvec iov; + int ret, buflen; mutex_lock(&con->sock_mutex); @@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { ret = -EINVAL; goto out_close; } - if (con->rx_page == NULL) { - /* - * This doesn't need to be atomic, but I think it should - * improve performance if it is. - */ - con->rx_page = alloc_page(GFP_ATOMIC); - if (con->rx_page == NULL) + /* realloc if we get new buffer size to read out */ + buflen = dlm_config.ci_buffer_size; + if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { + ret = con_realloc_receive_buf(con, buflen); + if (ret < 0) goto out_resched; - cbuf_init(&con->cb, PAGE_SIZE); } - /* - * iov[0] is the bit of the circular buffer between the current end - * point (cb.base + cb.len) and the end of the buffer. 
- */ - iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); - iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; - nvec = 1; - - /* - * iov[1] is the bit of the circular buffer between the start of the - * buffer and the start of the currently used section (cb.base) + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes */ - if (cbuf_data(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb); - iov[1].iov_len = con->cb.base; - iov[1].iov_base = page_address(con->rx_page); - nvec = 2; - } - len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; - r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); if (ret <= 0) goto out_close; - else if (ret == len) + else if (ret == iov.iov_len) call_again_soon = 1; - cbuf_add(&con->cb, ret); - ret = dlm_process_incoming_buffer(con->nodeid, - page_address(con->rx_page), - con->cb.base, con->cb.len, - PAGE_SIZE); - if (ret < 0) { - log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d", - ret, page_address(con->rx_page), con->cb.base, - con->cb.len, r); - cbuf_eat(&con->cb, r); - } else { - cbuf_eat(&con->cb, ret); - } + /* new buflen according readed bytes and leftover from last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; - if (cbuf_empty(&con->cb) && !call_again_soon) { - __free_page(con->rx_page); - con->rx_page = NULL; + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. 
+ */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + call_again_soon = true; } if (call_again_soon) goto out_resched; + mutex_unlock(&con->sock_mutex); return 0; @@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con) int nodeid; struct connection *newcon; struct connection *addcon; + unsigned int mark; - mutex_lock(&connections_lock); if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); return -1; } - mutex_unlock(&connections_lock); mutex_lock_nested(&con->sock_mutex, 0); @@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con) return -1; } + dlm_comm_mark(nodeid, &mark); + sock_set_mark(newsock->sk, mark); + log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This @@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con) struct connection *othercon = newcon->othercon; if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); result = -ENOMEM; goto accept_err; } + + othercon->rx_buflen = dlm_config.ci_buffer_size; + othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS); + if (!othercon->rx_buf) { + mutex_unlock(&newcon->sock_mutex); + kfree(othercon); + log_print("failed to allocate incoming socket receive buffer"); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); @@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - /* set skb mark */ - result = 
dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto bind_err; - sock_set_mark(sock->sk, mark); con->rx_action = receive_from_sock; @@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto out_err; - sock_set_mark(sock->sk, mark); memset(&saddr, 0, sizeof(saddr)); @@ -1238,6 +1229,14 @@ static void init_local(void) } } +static void deinit_local(void) +{ + int i; + + for (i = 0; i < dlm_local_count; i++) + kfree(dlm_local_addr[i]); +} + /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { @@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work) send_to_sock(con); } - -/* Discard all entries on the write queues */ -static void clean_writequeues(void) -{ - foreach_conn(clean_one_writequeue); -} - static void work_stop(void) { if (recv_workqueue) @@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con) con->shutdown_action(con); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + kfree(con->rx_buf); + kfree(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); - if (con->othercon) - kmem_cache_free(con_cache, con->othercon); - hlist_del(&con->list); - kmem_cache_free(con_cache, con); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + if (con->othercon) { + clean_one_writequeue(con->othercon); + call_rcu(&con->othercon->rcu, connection_release); + } + clean_one_writequeue(con); + call_rcu(&con->rcu, connection_release); } static void work_flush(void) { - int ok; + 
int ok, idx; int i; - struct hlist_node *n; struct connection *con; - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); do { ok = 1; foreach_conn(stop_conn); @@ -1635,9 +1635,10 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_safe(con, n, - &connection_hash[i], list) { + hlist_for_each_entry_rcu(con, &connection_hash[i], + list) { ok &= test_bit(CF_READ_PENDING, &con->flags); ok &= test_bit(CF_WRITE_PENDING, &con->flags); if (con->othercon) { @@ -1648,6 +1649,7 @@ static void work_flush(void) } } } + srcu_read_unlock(&connections_srcu, idx); } while (!ok); } @@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void) /* Set all the flags to prevent any socket activity. */ - mutex_lock(&connections_lock); dlm_allow_conn = 0; - mutex_unlock(&connections_lock); + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + foreach_conn(shutdown_conn); work_flush(); - clean_writequeues(); foreach_conn(free_conn); work_stop(); - - kmem_cache_destroy(con_cache); + deinit_local(); } int dlm_lowcomms_start(void) @@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void) goto fail; } - error = -ENOMEM; - con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), - __alignof__(struct connection), 0, - NULL); - if (!con_cache) - goto fail; - error = work_start(); if (error) - goto fail_destroy; + goto fail; dlm_allow_conn = 1; @@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); - if (con) { - close_connection(con, false, true, true); - kmem_cache_free(con_cache, con); - } -fail_destroy: - kmem_cache_destroy(con_cache); + if (con) + free_conn(con); fail: return error; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 
921322d133e3..fde3a6afe4be 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,114 +22,84 @@ * into packets and sends them to the comms layer. */ +#include <asm/unaligned.h> + #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" #include "midcomms.h" - -static void copy_from_cb(void *dst, const void *base, unsigned offset, - unsigned len, unsigned limit) -{ - unsigned copy = len; - - if ((copy + offset) > limit) - copy = limit - offset; - memcpy(dst, base + offset, copy); - len -= copy; - if (len) - memcpy(dst + copy, base, len); -} - /* * Called from the low-level comms layer to process a buffer of * commands. - * - * Only complete messages are processed here, any "spare" bytes from - * the end of a buffer are saved and tacked onto the front of the next - * message that comes in. I doubt this will happen very often but we - * need to be able to cope with it and I don't want the task to be waiting - * for packets to come in when there is useful work to be done. */ -int dlm_process_incoming_buffer(int nodeid, const void *base, - unsigned offset, unsigned len, unsigned limit) +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - union dlm_packet p; - } __tmp; - union dlm_packet *p = &__tmp.p; - int ret = 0; - int err = 0; + const unsigned char *ptr = buf; + const struct dlm_header *hd; uint16_t msglen; - uint32_t lockspace; - - while (len > sizeof(struct dlm_header)) { - - /* Copy just the header to check the total length. The - message may wrap around the end of the buffer back to the - start, so we need to use a temp buffer and copy_from_cb. 
*/ - - copy_from_cb(p, base, offset, sizeof(struct dlm_header), - limit); - - msglen = le16_to_cpu(p->header.h_length); - lockspace = p->header.h_lockspace; + int ret = 0; - err = -EINVAL; - if (msglen < sizeof(struct dlm_header)) - break; - if (p->header.h_cmd == DLM_MSG) { - if (msglen < sizeof(struct dlm_message)) - break; - } else { - if (msglen < sizeof(struct dlm_rcom)) - break; - } - err = -E2BIG; - if (msglen > dlm_config.ci_buffer_size) { - log_print("message size %d from %d too big, buf len %d", - msglen, nodeid, len); - break; + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + /* no message should be more than this otherwise we + * cannot deliver this message to upper layers + */ + msglen = get_unaligned_le16(&hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE) { + log_print("received invalid length header: %u, will abort message parsing", + msglen); + return -EBADMSG; } - err = 0; - - /* If only part of the full message is contained in this - buffer, then do nothing and wait for lowcomms to call - us again later with more data. We return 0 meaning - we've consumed none of the input buffer. */ + /* caller will take care that leftover + * will be parsed next call with more data + */ if (msglen > len) break; - /* Allocate a larger temp buffer if the full message won't fit - in the buffer on the stack (which should work for most - ordinary messages). 
*/ - - if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); - if (p == NULL) - return ret; - } + switch (hd->h_cmd) { + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("dlm msg too small: %u, will skip this message", + msglen); + goto skip; + } - copy_from_cb(p, base, offset, msglen, limit); + break; + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("dlm rcom msg too small: %u, will skip this message", + msglen); + goto skip; + } - BUG_ON(lockspace != p->header.h_lockspace); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message", + hd->h_cmd); + goto skip; + } + /* for aligned memory access, we just copy current message + * to begin of the buffer which contains already parsed buffer + * data and should provide align access for upper layers + * because the start address of the buffer has a aligned + * address. This memmove can be removed when the upperlayer + * is capable of unaligned memory access. + */ + memmove(buf, ptr, msglen); + dlm_receive_buffer((union dlm_packet *)buf, nodeid); + +skip: ret += msglen; - offset += msglen; - offset &= (limit - 1); len -= msglen; - - dlm_receive_buffer(p, nodeid); + ptr += msglen; } - if (p != &__tmp.p) - kfree(p); - - return err ? 
err : ret; + return ret; } diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 2e122e81c8d0..61e90a921849 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,8 +12,7 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ -int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, - unsigned len, unsigned limit); +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 28bb5689333a..15880a68faad 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */ + strreplace(name, '/', '!'); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, is_removable); if (!inode) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 459ecb42cbd3..347be146884c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -224,7 +224,7 @@ submit_bio_retry: bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)blknr << LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ; + bio->bi_opf = REQ_OP_READ | (ra ? 
REQ_RAHEAD : 0); } err = bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ddaa516c008a..b9a09806512a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) enum { Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_cache_strategy, Opt_err }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index c8c381eadcd6..5bde77d70852 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; break; case EROFS_XATTR_INDEX_SECURITY: break; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6c939def00f9..50912a5420b4 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend { struct z_erofs_collector clt; struct erofs_map_blocks map; + bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + enum z_erofs_cache_alloctype type) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); @@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, - struct list_head *pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -620,8 +619,7 @@ restart_now: else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + 
preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); hitted: /* @@ -653,7 +651,7 @@ retry: /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + alloc_page(GFP_NOFS | __GFP_NOFAIL); newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, @@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, } static void z_erofs_submit_queue(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) @@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; + z_erofs_next_pcluster_t owned_head = f->clt.owned_head; /* since bio will be NULL, no need to initialize last_index */ pgoff_t last_index; unsigned int nr_bios = 0; @@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb, do { struct page *page; - int err; page = pickup_page_for_submission(pcl, i++, pagepool, MNGD_MAPPING(sbi), @@ -1216,11 +1214,12 @@ submit_bio_retry: LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; last_index = cur; @@ -1248,14 +1247,14 @@ submit_bio_retry: } static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_collector *clt, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, 
clt->owned_head, pagepool, io, &force_fg); + z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page) return err; } -static bool should_decompress_synchronously(struct erofs_sb_info *sbi, - unsigned int nr) -{ - return nr <= sbi->ctx.max_sync_decompress_pages; -} - static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); + unsigned int nr_pages = readahead_count(rac); + bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(inode, readahead_index(rac), - readahead_count(rac), false); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + f.readahead = true; f.headoffset = readahead_pos(rac); while ((page = readahead_page(rac))) { @@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at 
page %lu @ nid %llu", @@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7d40d78ea864..ae325541884e 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -359,7 +359,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: map->m_flags &= ~EROFS_MAP_ZIPPED; - /* fallthrough */ + fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: map->m_la = (lcn << lclusterbits) | m->clusterofs; break; @@ -416,7 +416,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: if (endoff >= m.clusterofs) map->m_flags &= ~EROFS_MAP_ZIPPED; - /* fallthrough */ + fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: if (endoff >= m.clusterofs) { map->m_la = (m.lcn << lclusterbits) | m.clusterofs; @@ -433,7 +433,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, end = (m.lcn << lclusterbits) | m.clusterofs; map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; - /* fallthrough */ + fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: /* get the correspoinding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 12eebcdea9c8..4df61129566d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,8 +218,7 @@ struct eventpoll { struct file *file; /* used to optimize loop detection check */ - struct list_head visited_list_link; - int visited; + u64 gen; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -274,6 +273,8 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +static u64 loop_check_gen = 0; + /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls 
poll_loop_ncalls; @@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; -/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ -static LIST_HEAD(visited_list); - /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. @@ -1450,7 +1448,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { - const char *name; + struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { @@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi) return -ENOMEM; } - name = epi->ffd.file->f_path.dentry->d_name.name; - ws = wakeup_source_register(NULL, name); + take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); + ws = wakeup_source_register(NULL, n.name.name); + release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; @@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, RCU_INIT_POINTER(epi->ws, NULL); } + /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. 
+ */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (full_check && reverse_path_check()) + goto error_remove_epi; + /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); @@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (epi->nwait < 0) goto error_unregister; - /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); - - /* - * Add the current item to the RB tree. All RB tree operations are - * protected by "mtx", and ep_insert() is called with "mtx" held. - */ - ep_rbtree_insert(ep, epi); - - /* now check if we've created too many backpaths */ - error = -EINVAL; - if (full_check && reverse_path_check()) - goto error_remove_epi; - /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); @@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, return 0; +error_unregister: + ep_unregister_pollwait(ep, epi); error_remove_epi: spin_lock(&tfile->f_lock); list_del_rcu(&epi->fllink); @@ -1595,9 +1596,6 @@ error_remove_epi: rb_erase_cached(&epi->rbn, &ep->rbr); -error_unregister: - ep_unregister_pollwait(ep, epi); - /* * We need to do this because an event could have been arrived on some * allocated wait queue. 
Note that we don't care about the ep->ovflist @@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); - ep->visited = 1; - list_add(&ep->visited_list_link, &visited_list); + ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->visited) + if (ep_tovisit->gen == loop_check_gen) continue; error = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, epi->ffd.file, @@ -1994,9 +1991,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) * not already there, and calling reverse_path_check() * during ep_insert(). */ - if (list_empty(&epi->ffd.file->f_tfile_llink)) - list_add(&epi->ffd.file->f_tfile_llink, - &tfile_check_list); + if (list_empty(&epi->ffd.file->f_tfile_llink)) { + if (get_file_rcu(epi->ffd.file)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); + } } } mutex_unlock(&ep->mtx); @@ -2017,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - int ret; - struct eventpoll *ep_cur, *ep_next; - - ret = ep_call_nested(&poll_loop_ncalls, + return ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, file, ep, current); - /* clear visited list */ - list_for_each_entry_safe(ep_cur, ep_next, &visited_list, - visited_list_link) { - ep_cur->visited = 0; - list_del(&ep_cur->visited_list_link); - } - return ret; } static void clear_tfile_check_list(void) @@ -2040,6 +2029,7 @@ static void clear_tfile_check_list(void) file = list_first_entry(&tfile_check_list, struct file, f_tfile_llink); list_del_init(&file->f_tfile_llink); + fput(file); } INIT_LIST_HEAD(&tfile_check_list); } @@ -2192,33 +2182,32 @@ int do_epoll_ctl(int epfd, int op, int fd, struct 
epoll_event *epds, goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || + ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) goto error_tgt_fput; + loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) { - clear_tfile_check_list(); + if (ep_loop_check(ep, tf.file) != 0) goto error_tgt_fput; - } - } else + } else { + get_file(tf.file); list_add(&tf.file->f_tfile_llink, &tfile_check_list); + } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); - if (error) { -out_del: - list_del(&tf.file->f_tfile_llink); + if (error) goto error_tgt_fput; - } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = epoll_mutex_lock(&tep->mtx, 1, nonblock); if (error) { mutex_unlock(&ep->mtx); - goto out_del; + goto error_tgt_fput; } } } @@ -2239,8 +2228,6 @@ out_del: error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; - if (full_check) - clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -2263,8 +2250,11 @@ out_del: mutex_unlock(&ep->mtx); error_tgt_fput: - if (full_check) + if (full_check) { + clear_tfile_check_list(); + loop_check_gen++; mutex_unlock(&epmutex); + } fdput(tf); error_fput: diff --git a/fs/exec.c b/fs/exec.c index a91003e28eaa..07910f5032e7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,6 +62,7 @@ #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1895,6 +1896,11 @@ static int bprm_execve(struct linux_binprm *bprm, struct files_struct *displaced; int retval; + /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + retval = unshare_files(&displaced); if (retval) return retval; diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c index 03d0824fc368..5a2f119b7e8c 100644 --- a/fs/exfat/cache.c +++ 
b/fs/exfat/cache.c @@ -17,7 +17,6 @@ #include "exfat_raw.h" #include "exfat_fs.h" -#define EXFAT_CACHE_VALID 0 #define EXFAT_MAX_CACHE 16 struct exfat_cache { @@ -61,16 +60,6 @@ void exfat_cache_shutdown(void) kmem_cache_destroy(exfat_cachep); } -void exfat_cache_init_inode(struct inode *inode) -{ - struct exfat_inode_info *ei = EXFAT_I(inode); - - spin_lock_init(&ei->cache_lru_lock); - ei->nr_caches = 0; - ei->cache_valid_id = EXFAT_CACHE_VALID + 1; - INIT_LIST_HEAD(&ei->cache_lru); -} - static inline struct exfat_cache *exfat_cache_alloc(void) { return kmem_cache_alloc(exfat_cachep, GFP_NOFS); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 95d717f8620c..c013fe931d9c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -248,6 +248,8 @@ struct exfat_sb_info { struct rcu_head rcu; }; +#define EXFAT_CACHE_VALID 0 + /* * EXFAT file system inode in-memory data */ @@ -428,7 +430,6 @@ extern const struct dentry_operations exfat_utf8_dentry_ops; /* cache.c */ int exfat_cache_init(void); void exfat_cache_shutdown(void); -void exfat_cache_init_inode(struct inode *inode); void exfat_cache_inval_inode(struct inode *inode); int exfat_get_cluster(struct inode *inode, unsigned int cluster, unsigned int *fclus, unsigned int *dclus, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 7f90204adef5..a6de17cac3df 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -611,8 +611,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) ei->i_crtime = info->crtime; inode->i_atime = info->atime; - exfat_cache_init_inode(inode); - return 0; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e73f20f66cb2..c94ac239f740 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -578,7 +578,8 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; 
inode_inc_iversion(inode); @@ -745,10 +746,9 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry, i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; - } i_mode = inode->i_mode; alias = d_find_alias(inode); @@ -890,10 +890,9 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; - } inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 3b6a1659892f..60b941ba557b 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -376,7 +376,6 @@ static int exfat_read_root(struct inode *inode) inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = current_time(inode); exfat_truncate_atime(&inode->i_atime); - exfat_cache_init_inode(inode); return 0; } @@ -763,6 +762,10 @@ static void exfat_inode_init_once(void *foo) { struct exfat_inode_info *ei = (struct exfat_inode_info *)foo; + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = EXFAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); INIT_HLIST_NODE(&ei->i_hash_fat); inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 60378ddf1424..96044f5dbc0e 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -93,8 +93,10 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); - if (vmf->flags & FAULT_FLAG_WRITE) { + if (write) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } @@ -103,7 +105,7 @@ static 
vm_fault_t ext2_dax_fault(struct vm_fault *vmf) ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); - if (vmf->flags & FAULT_FLAG_WRITE) + if (write) sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 80662e1f7889..415c21f0e750 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1241,7 +1241,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 1); } - /* fall through */ + fallthrough; case EXT2_IND_BLOCK: nr = i_data[EXT2_DIND_BLOCK]; if (nr) { @@ -1249,7 +1249,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 2); } - /* fall through */ + fallthrough; case EXT2_DIND_BLOCK: nr = i_data[EXT2_TIND_BLOCK]; if (nr) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index dda860562ca3..7fab2b3b5b39 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -587,7 +587,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_xip: ext2_msg(sb, KERN_INFO, "use dax instead of xip"); set_opt(opts->s_mount_opt, XIP); - /* Fall through */ + fallthrough; case Opt_dax: #ifdef CONFIG_FS_DAX ext2_msg(sb, KERN_WARNING, diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 1afa5a4bcb5f..619dd35ddd48 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -110,7 +110,7 @@ config EXT4_KUNIT_TESTS This builds the ext4 KUnit tests. KUnit tests run during boot and output the results to the debug log - in TAP format (http://testanything.org/). Only useful for kernel devs + in TAP format (https://testanything.org/). Only useful for kernel devs running KUnit test harness and are not for inclusion into a production build. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ba46d87cdf1..48c3df47748d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -413,7 +413,8 @@ verified: * Return buffer_head on success or an ERR_PTR in case of failure. 
*/ struct buffer_head * -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -441,6 +442,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) return ERR_PTR(-ENOMEM); } + if (ignore_locked && buffer_locked(bh)) { + /* buffer under IO already, return if called for prefetching */ + put_bh(bh); + return NULL; + } + if (bitmap_uptodate(bh)) goto verify; @@ -487,10 +494,11 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ set_buffer_new(bh); - trace_ext4_read_block_bitmap_load(sb, block_group); + trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO | + (ignore_locked ? 
REQ_RAHEAD : 0), bh); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh; int err; - bh = ext4_read_block_bitmap_nowait(sb, block_group); + bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 16e9b2fda03a..c54ba52f2dd4 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -24,6 +24,7 @@ struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; + u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; @@ -45,7 +46,8 @@ void ext4_exit_system_zone(void) static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { - if ((entry1->start_blk + entry1->count) == entry2->start_blk) + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) return 1; return 0; } @@ -66,9 +68,9 @@ static void release_system_zone(struct ext4_system_blocks *system_blks) */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, u32 ino) { - struct ext4_system_zone *new_entry = NULL, *entry; + struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; @@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; - else { - if (start_blk + count > (entry->start_blk + - entry->count)) - entry->count = (start_blk + count - - entry->start_blk); - new_node = *n; - new_entry = rb_entry(new_node, struct ext4_system_zone, - node); - break; - } + else /* Unexpected overlap of system zones. 
*/ + return -EFSCORRUPTED; } - if (!new_entry) { - new_entry = kmem_cache_alloc(ext4_system_zone_cachep, - GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->start_blk = start_blk; - new_entry->count = count; - new_node = &new_entry->node; - - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &system_blks->root); - } + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_entry->ino = ino; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); @@ -151,40 +144,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); } -/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. - */ -static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, - struct ext4_system_blocks *system_blks, - ext4_fsblk_t start_blk, - unsigned int count) -{ - struct ext4_system_zone *entry; - struct rb_node *n; - - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) - return 0; - - if (system_blks == NULL) - return 1; - - n = system_blks->root.rb_node; - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else - return 0; - } - return 1; -} - static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) @@ -214,19 +173,18 @@ static int ext4_protect_reserved_inode(struct super_block *sb, if (n == 0) { i++; } else { - if (!ext4_data_block_valid_rcu(sbi, system_blks, - map.m_pblk, n)) 
{ - err = -EFSCORRUPTED; - __ext4_error(sb, __func__, __LINE__, -err, - map.m_pblk, "blocks %llu-%llu " - "from inode %u overlap system zone", - map.m_pblk, - map.m_pblk + map.m_len - 1, ino); + err = add_system_zone(system_blks, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + __ext4_error(sb, __func__, __LINE__, + -err, map.m_pblk, + "blocks %llu-%llu from inode %u overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1, + ino); + } break; } - err = add_system_zone(system_blks, map.m_pblk, n); - if (err < 0) - break; i += n; } } @@ -262,14 +220,6 @@ int ext4_setup_system_zone(struct super_block *sb) int flex_size = ext4_flex_bg_size(sbi); int ret; - if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks) - ext4_release_system_zone(sb); - return 0; - } - if (sbi->system_blks) - return 0; - system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); if (!system_blks) return -ENOMEM; @@ -277,22 +227,25 @@ int ext4_setup_system_zone(struct super_block *sb) for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(system_blks, + ((i < 5) || ((i % flex_size) == 0))) { + ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1); + ext4_bg_num_gdb(sb, i) + 1, 0); + if (ret) + goto err; + } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, - ext4_block_bitmap(sb, gdp), 1); + ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, - ext4_inode_bitmap(sb, gdp), 1); + ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), - sbi->s_itb_per_group); + sbi->s_itb_per_group, 0); if (ret) goto err; } @@ -341,11 +294,24 @@ void ext4_release_system_zone(struct super_block *sb) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } -int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, +/* + * 
Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with some other filesystem metadata blocks. + */ +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_system_blocks *system_blks; - int ret; + struct ext4_system_zone *entry; + struct rb_node *n; + int ret = 1; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; /* * Lock the system zone to prevent it being released concurrently @@ -354,8 +320,22 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, */ rcu_read_lock(); system_blks = rcu_dereference(sbi->system_blks); - ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, - count); + if (system_blks == NULL) + goto out_rcu; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + ret = (entry->ino == inode->i_ino); + break; + } + } +out_rcu: rcu_read_unlock(); return ret; } @@ -374,8 +354,7 @@ int ext4_check_blockref(const char *function, unsigned int line, while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d82336b1cd4..efe77cffc322 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -148,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } if (IS_ENCRYPTED(inode)) { - err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); + err = 
fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 42f5060f3cdf..f9a692c0a66c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -434,10 +434,36 @@ struct flex_groups { #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ - -/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +/* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ + EXT4_UNRM_FL | \ + EXT4_COMPR_FL | \ + EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_JOURNAL_DATA_FL | \ + EXT4_NOTAIL_FL | \ + EXT4_DIRSYNC_FL | \ + EXT4_TOPDIR_FL | \ + EXT4_EXTENTS_FL | \ + 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ + EXT4_DAX_FL | \ + EXT4_PROJINHERIT_FL | \ + EXT4_CASEFOLD_FL) + +/* User visible flags */ +#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ + EXT4_DIRTY_FL | \ + EXT4_COMPRBLK_FL | \ + EXT4_NOCOMPR_FL | \ + EXT4_ENCRYPT_FL | \ + EXT4_INDEX_FL | \ + EXT4_VERITY_FL | \ + EXT4_INLINE_DATA_FL) + +/* Flags we can manipulate with through FS_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -669,8 +695,6 @@ enum { /* * ioctl commands */ -#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS #define EXT4_IOC_GETVERSION _IOR('f', 3, long) #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION @@ -687,17 +711,11 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY -#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT 
-#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) -#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR - #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* @@ -722,8 +740,6 @@ enum { /* * ioctl commands in 32 bit emulation */ -#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) @@ -1054,6 +1070,7 @@ struct ext4_inode_info { struct timespec64 i_crtime; /* mballoc */ + atomic_t i_prealloc_active; struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; @@ -1172,6 +1189,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1383,7 +1401,7 @@ struct ext4_super_block { #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1501,10 +1519,13 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; unsigned int s_max_dir_size_kb; /* 
where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1572,9 +1593,11 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + atomic_t s_warning_count; + atomic_t s_msg_count; - /* Encryption context for '-o test_dummy_encryption' */ - struct fscrypt_dummy_context s_dummy_enc_ctx; + /* Encryption policy for '-o test_dummy_encryption' */ + struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA @@ -1585,6 +1608,9 @@ struct ext4_sb_info { #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif + /* Record the errseq of the backing block device */ + errseq_t s_bdev_wb_err; + spinlock_t s_bdev_wb_lock; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2313,9 +2339,15 @@ struct ext4_lazy_init { struct mutex li_list_mtx; }; +enum ext4_li_mode { + EXT4_LI_MODE_PREFETCH_BBITMAP, + EXT4_LI_MODE_ITABLE, +}; + struct ext4_li_request { struct super_block *lr_super; - struct ext4_sb_info *lr_sbi; + enum ext4_li_mode lr_mode; + ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; @@ -2446,7 +2478,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, - ext4_group_t block_group); + ext4_group_t block_group, + bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); @@ -2651,9 +2684,15 @@ extern int ext4_mb_release(struct super_block *); 
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_discard_preallocations(struct inode *); +extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); +extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, + ext4_group_t group, + unsigned int nr, int *cnt); +extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr); + extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); @@ -2765,8 +2804,7 @@ extern int ext4_search_dir(struct buffer_head *bh, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); -extern int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, +extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, @@ -2924,12 +2962,6 @@ do { \ #endif -extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, - __u32 compat); -extern int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat); -extern int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat); extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, @@ -3145,6 +3177,7 @@ struct ext4_group_info { (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) @@ -3159,6 +3192,8 @@ struct ext4_group_info { (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, 
&((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 @@ -3363,9 +3398,9 @@ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0c76cdd44d90..760b9ee49dc0 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -195,6 +195,28 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line, jbd2_journal_abort_handle(handle); } +static void ext4_check_bdev_write_error(struct super_block *sb) +{ + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * If the block device has write error flag, it may have failed to + * async write out metadata buffers in the background. In this case, + * we could read old data from disk and write it out again, which + * may lead to on-disk filesystem inconsistency. 
+ */ + if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { + spin_lock(&sbi->s_bdev_wb_lock); + err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); + spin_unlock(&sbi->s_bdev_wb_lock); + if (err) + ext4_error_err(sb, -err, + "Error while async write back metadata"); + } +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { @@ -202,6 +224,9 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); + if (bh->b_bdev->bd_super) + ext4_check_bdev_write_error(bh->b_bdev->bd_super); + if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 221f240eae60..a0481582187a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_mutex. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -340,7 +340,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) */ if (lblock + len <= lblock) return 0; - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); + return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, @@ -348,7 +348,7 @@ static int ext4_valid_extent_idx(struct inode *inode, { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); + return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, @@ -507,14 +507,10 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - if (!ext4_has_feature_journal(inode->i_sb) || - 
(inode->i_ino != - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) { - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); - if (err) - goto errout; - } + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries @@ -693,10 +689,8 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) return; depth = path->p_depth; for (i = 0; i <= depth; i++, path++) { - if (path->p_bh) { - brelse(path->p_bh); - path->p_bh = NULL; - } + brelse(path->p_bh); + path->p_bh = NULL; } } @@ -1915,7 +1909,7 @@ out: /* * ext4_ext_insert_extent: - * tries to merge requsted extent into the existing extent or + * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ @@ -3125,7 +3119,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states - * of which are deterimined by split_flag. + * of which are determined by split_flag. * * There are two cases: * a> the extent are splitted into two extent. @@ -3650,7 +3644,7 @@ static int ext4_split_convert_extents(handle_t *handle, eof_block = map->m_lblk + map->m_len; /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ depth = ext_depth(inode); ex = path[depth].p_ext; @@ -4272,7 +4266,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). 
*/ - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -4495,7 +4489,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Round up offset. This is not fallocate, we neet to zero out + * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the * range. @@ -5299,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ret = ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); @@ -5313,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); @@ -5445,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { @@ -5579,7 +5573,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; - /* Do we have somthing to swap ? */ + /* Do we have something to swap ? 
*/ if (unlikely(!ex2 || !ex1)) goto finish; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 129cc1dd6b79..7d61069531d3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -145,10 +145,9 @@ static int ext4_release_file(struct inode *inode, struct file *filp) /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && - !EXT4_I(inode)->i_reserved_data_blocks) - { + !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -428,6 +427,10 @@ restart: */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || !ext4_overwrite_io(inode, offset, count))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); @@ -812,7 +815,7 @@ out: return err; } -static int ext4_file_open(struct inode * inode, struct file * filp) +static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 3e133793a5a3..2924261226e0 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -233,7 +233,7 @@ static int __ext4fs_dirhash(const char *name, int len, break; case DX_HASH_HALF_MD4_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; - /* fall through */ + fallthrough; case DX_HASH_HALF_MD4: p = name; while (len > 0) { @@ -247,7 +247,7 @@ static int __ext4fs_dirhash(const char *name, int len, break; case DX_HASH_TEA_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; - /* fall through */ + fallthrough; case DX_HASH_TEA: p = name; while (len > 0) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df25d38d6539..698ca4a4db5f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -742,6 +742,53 @@ not_found: return 1; } +static int ext4_xattr_credits_for_new_inode(struct inode *dir, 
mode_t mode, + bool encrypt) +{ + struct super_block *sb = dir->i_sb; + int nblocks = 0; +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (IS_ERR(p)) + return PTR_ERR(p); + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never more than 1k. + * In practice they are under 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, + NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + return nblocks; +} + /* * There are two policies for allocating an inode. 
If the new inode is * a directory, then a forward search is made for a block group with both @@ -772,7 +819,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; - int encrypt = 0; + bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -784,59 +831,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && - !(i_flags & EXT4_EA_INODE_FL)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return ERR_PTR(err); - if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-ENOKEY); - encrypt = 1; - } - - if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); - - if (IS_ERR(p)) - return ERR_CAST(p); - if (p) { - int acl_size = p->a_count * sizeof(ext4_acl_entry); - - nblocks += (S_ISDIR(mode) ? 2 : 1) * - __ext4_xattr_set_credits(sb, NULL /* inode */, - NULL /* block_bh */, acl_size, - true /* is_create */); - posix_acl_release(p); - } -#endif - -#ifdef CONFIG_SECURITY - { - int num_security_xattrs = 1; - -#ifdef CONFIG_INTEGRITY - num_security_xattrs++; -#endif - /* - * We assume that security xattrs are never - * more than 1k. In practice they are under - * 128 bytes. 
- */ - nblocks += num_security_xattrs * - __ext4_xattr_set_credits(sb, NULL /* inode */, - NULL /* block_bh */, 1024, - true /* is_create */); - } -#endif - if (encrypt) - nblocks += __ext4_xattr_set_credits(sb, - NULL /* inode */, NULL /* block_bh */, - FSCRYPT_SET_CONTEXT_MAX_SIZE, - true /* is_create */); - } - ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); @@ -866,10 +860,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + if (!(i_flags & EXT4_EA_INODE_FL)) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto out; + } + err = dquot_initialize(inode); if (err) goto out; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { + ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); + if (ret2 < 0) { + err = ret2; + goto out; + } + nblocks += ret2; + } + if (!goal) goal = sbi->s_inode_goal; @@ -1162,7 +1171,7 @@ got: * prevent its deduplication. */ if (encrypt) { - err = fscrypt_inherit_context(dir, inode, handle, true); + err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index be2b66eb65f7..80c9f33800be 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_mutex. So we can safely drop the i_data_sem here. 
*/ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -858,8 +858,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { + if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); @@ -1004,8 +1003,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { + if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", @@ -1182,21 +1180,21 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_TIND_BLOCK: ; } @@ -1436,7 +1434,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; @@ -1445,7 +1443,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; @@ -1454,7 +1452,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } - /* 
fall through */ + fallthrough; case EXT4_TIND_BLOCK: ; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c3a1ad2db122..75c97bca0815 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -276,7 +276,7 @@ static int ext4_create_inline_data(handle_t *handle, len = 0; } - /* Insert the the xttr entry. */ + /* Insert the xttr entry. */ i.value = value; i.value_len = len; @@ -1706,7 +1706,7 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) goto out; - err = ext4_generic_delete_entry(handle, dir, de_del, bh, + err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 44bad4bb8831..bf596467c234 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); } static int __check_block_validity(struct inode *inode, const char *func, @@ -394,8 +394,7 @@ static int __check_block_validity(struct inode *inode, const char *func, (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, - map->m_len)) { + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, @@ -3288,7 +3287,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) if (PageChecked(page)) return 0; if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page); else return try_to_free_buffers(page); } @@ -4056,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (stop_block > first_block) { down_write(&EXT4_I(inode)->i_data_sem); - 
ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ret = ext4_es_remove_extent(inode, first_block, stop_block - first_block); @@ -4163,7 +4162,7 @@ int ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return 0; + goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); @@ -4172,16 +4171,14 @@ int ext4_truncate(struct inode *inode) int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); - if (err) - return err; - if (has_inline) - return 0; + if (err || has_inline) + goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return 0; + goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4190,8 +4187,10 @@ int ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_trace; + } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4211,7 +4210,7 @@ int ext4_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); @@ -4242,6 +4241,7 @@ out_stop: err = err2; ext4_journal_stop(handle); +out_trace: trace_ext4_truncate_exit(inode); return err; } @@ -4760,7 +4760,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = 0; if (ei->i_file_acl && - !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 
0, "iget: bad extended attribute block %llu", ei->i_file_acl); @@ -4901,7 +4901,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, (inode->i_state & I_DIRTY_TIME)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 999cf6add39c..36eca3bc036a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb, reset_inode_seed(inode); reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { @@ -819,12 +819,12 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); - case EXT4_IOC_GETFLAGS: + case FS_IOC_GETFLAGS: flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (S_ISREG(inode->i_mode)) flags &= ~EXT4_PROJINHERIT_FL; return put_user(flags, (int __user *) arg); - case EXT4_IOC_SETFLAGS: { + case FS_IOC_SETFLAGS: { int err; if (!inode_owner_or_capable(inode)) @@ -1129,12 +1129,12 @@ resizefs_out: case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - case EXT4_IOC_GET_ENCRYPTION_PWSALT: { + case FS_IOC_GET_ENCRYPTION_PWSALT: { #ifdef CONFIG_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1174,7 +1174,7 @@ resizefs_out: return -EOPNOTSUPP; #endif } - case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); @@ -1236,7 +1236,7 @@ 
resizefs_out: case EXT4_IOC_GET_ES_CACHE: return ext4_ioctl_get_es_cache(filp, arg); - case EXT4_IOC_FSGETXATTR: + case FS_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1247,7 +1247,7 @@ resizefs_out: return -EFAULT; return 0; } - case EXT4_IOC_FSSETXATTR: + case FS_IOC_FSSETXATTR: { struct fsxattr fa, old_fa; int err; @@ -1313,11 +1313,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case EXT4_IOC32_GETFLAGS: - cmd = EXT4_IOC_GETFLAGS; + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; break; - case EXT4_IOC32_SETFLAGS: - cmd = EXT4_IOC_SETFLAGS; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; break; case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; @@ -1361,9 +1361,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_RESIZE_FS: case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: - case EXT4_IOC_SET_ENCRYPTION_POLICY: - case EXT4_IOC_GET_ENCRYPTION_PWSALT: - case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: @@ -1377,8 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: - case EXT4_IOC_FSGETXATTR: - case EXT4_IOC_FSSETXATTR: + case FS_IOC_FSGETXATTR: + case FS_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c0a331e2feb0..132c118d12e1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) bh[i] = NULL; continue; } - bh[i] = ext4_read_block_bitmap_nowait(sb, group); + bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if 
(IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; @@ -1279,9 +1279,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); - return 0; err: @@ -1743,10 +1740,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, } -/* - * regular allocator, for general purposes allocation - */ - static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) @@ -2119,13 +2112,11 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); - free = grp->bb_free; - if (free == 0) - return false; - if (cr <= 2 && free < ac->ac_g_ex.fe_len) + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + free = grp->bb_free; + if (free == 0) return false; fragments = grp->bb_fragments; @@ -2142,8 +2133,10 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ((group % flex_size) == 0)) return false; - if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || - (free / fragments) >= ac->ac_g_ex.fe_len) + if (free < ac->ac_g_ex.fe_len) + return false; + + if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) return true; if (grp->bb_largest_free_order < ac->ac_2order) @@ -2177,6 +2170,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; @@ -2195,7 +2189,25 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(ac->ac_sb, group, 
GFP_NOFS); + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, group, NULL); + int ret; + + /* cr=0/1 is a very optimistic search to find large + * good chunks almost for free. If buddy data is not + * ready, then this optimization makes no sense. But + * we never skip the first block group in a flex_bg, + * since this gets used for metadata block allocation, + * and we want to make sure we locate metadata blocks + * in the first block group in the flex_bg if possible. + */ + if (cr < 2 && + (!sbi->s_log_groups_per_flex || + ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) + return 0; + ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } @@ -2209,15 +2221,95 @@ out: return ret; } +/* + * Start prefetching @nr block bitmaps starting at @group. + * Return the next group which needs to be prefetched. + */ +ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, + unsigned int nr, int *cnt) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct buffer_head *bh; + struct blk_plug plug; + + blk_start_plug(&plug); + while (nr-- > 0) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, + NULL); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + /* + * Prefetch block groups with free blocks; but don't + * bother if it is marked uninitialized on disk, since + * it won't require I/O to read. Also only try to + * prefetch once, so we avoid getblk() call, which can + * be expensive. 
+ */ + if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + bh = ext4_read_block_bitmap_nowait(sb, group, true); + if (bh && !IS_ERR(bh)) { + if (!buffer_uptodate(bh) && cnt) + (*cnt)++; + brelse(bh); + } + } + if (++group >= ngroups) + group = 0; + } + blk_finish_plug(&plug); + return group; +} + +/* + * Prefetching reads the block bitmap into the buffer cache; but we + * need to make sure that the buddy bitmap in the page cache has been + * initialized. Note that ext4_mb_init_group() will block if the I/O + * is not yet completed, or indeed if it was not initiated by + * ext4_mb_prefetch did not start the I/O. + * + * TODO: We should actually kick off the buddy bitmap setup in a work + * queue when the buffer I/O is completed, so that we don't block + * waiting for the block allocation bitmap read to finish when + * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
+ */ +void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr) +{ + while (nr-- > 0) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, + NULL); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + if (!group) + group = ext4_get_groups_count(sb); + group--; + grp = ext4_get_group_info(sb, group); + + if (EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } + } +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t ngroups, group, i; + ext4_group_t prefetch_grp = 0, ngroups, group, i; int cr = -1; int err = 0, first_err = 0; + unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; + int lost; sb = ac->ac_sb; sbi = EXT4_SB(sb); @@ -2237,8 +2329,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) goto out; /* - * ac->ac2_order is set only if the fe_len is a power of 2 - * if ac2_order is set we also set criteria to 0 so that we + * ac->ac_2order is set only if the fe_len is a power of 2 + * if ac->ac_2order is set we also set criteria to 0 so that we * try exact allocation using buddy. 
*/ i = fls(ac->ac_g_ex.fe_len); @@ -2282,6 +2374,7 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; + prefetch_grp = group; for (i = 0; i < ngroups; group++, i++) { int ret = 0; @@ -2293,6 +2386,29 @@ repeat: if (group >= ngroups) group = 0; + /* + * Batch reads of the block allocation bitmaps + * to get multiple READs in flight; limit + * prefetching at cr=0/1, otherwise mballoc can + * spend a lot of time loading imperfect groups + */ + if ((prefetch_grp == group) && + (cr > 1 || + prefetch_ios < sbi->s_mb_prefetch_limit)) { + unsigned int curr_ios = prefetch_ios; + + nr = sbi->s_mb_prefetch; + if (ext4_has_feature_flex_bg(sb)) { + nr = (group / sbi->s_mb_prefetch) * + sbi->s_mb_prefetch; + nr = nr + sbi->s_mb_prefetch - group; + } + prefetch_grp = ext4_mb_prefetch(sb, group, + nr, &prefetch_ios); + if (prefetch_ios == curr_ios) + nr = 0; + } + /* This now checks without needing the buddy page */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { @@ -2341,22 +2457,24 @@ repeat: * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ - ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { /* * Someone more lucky has already allocated it. 
* The only thing we can do is just take first * found block(s) - printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); */ + lost = atomic_inc_return(&sbi->s_mb_lost_chunks); + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, lost); + ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; cr = 3; - atomic_inc(&sbi->s_mb_lost_chunks); goto repeat; } } @@ -2367,6 +2485,10 @@ out: mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, cr, err); + + if (nr) + ext4_mb_prefetch_fini(sb, prefetch_grp, nr); + return err; } @@ -2439,7 +2561,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg.info.bb_counters[i] : 0); - seq_printf(seq, " ]\n"); + seq_puts(seq, " ]\n"); return 0; } @@ -2613,6 +2735,26 @@ static int ext4_mb_init_backend(struct super_block *sb) goto err_freebuddy; } + if (ext4_has_feature_flex_bg(sb)) { + /* a single flex group is supposed to be read by a single IO */ + sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; + sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ + } else { + sbi->s_mb_prefetch = 32; + } + if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch = ext4_get_groups_count(sb); + /* now many real IOs to prefetch within a single allocation at cr=0 + * given cr=0 is an CPU-related optimization we shouldn't try to + * load too many groups, at some point we should start to use what + * we've got in memory. 
+ * with an average random access time 5ms, it'd take a second to get + * 200 groups (* N with flex_bg), so let's make this limit 4 + */ + sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; + if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); + return 0; err_freebuddy: @@ -2736,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -3090,7 +3233,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (!ext4_data_block_valid(sbi, block, len)) { + if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error @@ -3674,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } +static void ext4_mb_mark_pa_deleted(struct super_block *sb, + struct ext4_prealloc_space *pa) +{ + struct ext4_inode_info *ei; + + if (pa->pa_deleted) { + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", + pa->pa_type, pa->pa_pstart, pa->pa_lstart, + pa->pa_len); + return; + } + + pa->pa_deleted = 1; + + if (pa->pa_type == MB_INODE_PA) { + ei = EXT4_I(pa->pa_inode); + atomic_dec(&ei->i_prealloc_active); + } +} + static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; @@ -3706,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, return; } - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); 
spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; @@ -3830,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); + atomic_inc(&ei->i_prealloc_active); } /* @@ -4040,7 +4204,7 @@ repeat: } /* seems this one can be freed ... */ - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); /* we can trust pa_free ... */ free += pa->pa_free; @@ -4103,7 +4267,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode, unsigned int needed) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -4121,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); - trace_ext4_discard_preallocations(inode); + trace_ext4_discard_preallocations(inode, + atomic_read(&ei->i_prealloc_active), needed); INIT_LIST_HEAD(&list); + if (needed == 0) + needed = UINT_MAX; + repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list)) { - pa = list_entry(ei->i_prealloc_list.next, + while (!list_empty(&ei->i_prealloc_list) && needed) { + pa = list_entry(ei->i_prealloc_list.prev, struct ext4_prealloc_space, pa_inode_list); BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); @@ -4146,10 +4314,11 @@ repeat: } if (pa->pa_deleted == 0) { - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); list_add(&pa->u.pa_tmp_list, &list); + needed--; continue; } @@ -4399,7 +4568,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; - /* we have to define context: we'll we work with a file or + /* we have to define context: we'll work with a 
file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); @@ -4450,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); @@ -4549,10 +4718,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } /* + * if per-inode prealloc list is too long, trim some PA + */ +static void ext4_mb_trim_inode_pa(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int count, delta; + + count = atomic_read(&ei->i_prealloc_active); + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; + if (count > sbi->s_mb_max_inode_prealloc + delta) { + count -= sbi->s_mb_max_inode_prealloc; + ext4_discard_preallocations(inode, count); + } +} + +/* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct inode *inode = ac->ac_inode; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -4564,21 +4752,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); + + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if (likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } } - } - if (pa) { - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. 
- */ - if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + + if (pa->pa_type == MB_INODE_PA) { + /* + * treat per-inode prealloc list as a lru list, then try + * to trim the least recently used PA. + */ spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); + list_move(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); } + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) @@ -4588,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); + ext4_mb_trim_inode_pa(inode); return 0; } @@ -4915,7 +5114,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_data_block_valid(sbi, block, count)) { + !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6b4d17c2935d..e75b4749aa1c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -73,6 +73,10 @@ */ #define MB_DEFAULT_GROUP_PREALLOC 512 +/* + * maximum length of inode prealloc list + */ +#define MB_DEFAULT_MAX_INODE_PREALLOC 512 struct ext4_free_data { /* this links the free block information from sb_info */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 1ed86fb6c302..0d601b822875 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, out: if (*moved_len) { - ext4_discard_preallocations(orig_inode); - ext4_discard_preallocations(donor_inode); + ext4_discard_preallocations(orig_inode, 0); + ext4_discard_preallocations(donor_inode, 0); } ext4_ext_drop_refs(path); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 56738b538ddf..0d74615fcce3 100644 
--- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -663,8 +663,7 @@ static struct stats dx_show_leaf(struct inode *dir, /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( - dir, len, - &fname_crypto_str); + len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " @@ -1016,8 +1015,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, brelse(bh); return err; } - err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN, - &fname_crypto_str); + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, + &fname_crypto_str); if (err < 0) { brelse(bh); return err; @@ -1396,8 +1395,8 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, + buf_size, offset)) return -1; *res_dir = de; return 1; @@ -1858,7 +1857,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, blocksize, hinfo, map); map -= count; dx_sort_map(map, count); - /* Split the existing block in the middle, size-wise */ + /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { @@ -1868,8 +1867,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, size += map[i].size; move++; } - /* map index at which we will split */ - split = count - move; + /* + * map index at which we will split + * + * If the sum of active entries didn't exceed half the block size, just + * split it in half by count; each resulting block will have at least + * half the space free. 
+ */ + if (i > 0) + split = count - move; + else + split = count/2; + hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", @@ -2455,8 +2464,7 @@ cleanup: * ext4_generic_delete_entry deletes a directory entry by merging it * with the previous entry */ -int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, +int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, @@ -2472,7 +2480,7 @@ int ext4_generic_delete_entry(handle_t *handle, de = (struct ext4_dir_entry_2 *)entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, i)) + entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) @@ -2517,8 +2525,7 @@ static int ext4_delete_entry(handle_t *handle, if (unlikely(err)) goto out; - err = ext4_generic_delete_entry(handle, dir, de_del, - bh, bh->b_data, + err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data, dir->i_sb->s_blocksize, csum_size); if (err) goto out; @@ -3193,30 +3200,33 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) * in separate transaction */ retval = dquot_initialize(dir); if (retval) - return retval; + goto out_trace; retval = dquot_initialize(d_inode(dentry)); if (retval) - return retval; + goto out_trace; - retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (!bh) - goto end_unlink; + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + goto out_trace; + } + if (!bh) { + retval = -ENOENT; + goto out_trace; + } inode = d_inode(dentry); - retval = -EFSCORRUPTED; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_unlink; + if (le32_to_cpu(de->inode) != inode->i_ino) { + retval = -EFSCORRUPTED; + goto out_bh; + } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); 
- handle = NULL; - goto end_unlink; + goto out_bh; } if (IS_DIRSYNC(dir)) @@ -3224,12 +3234,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto end_unlink; + goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto end_unlink; + goto out_handle; if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", dentry->d_name.len, dentry->d_name.name); @@ -3251,10 +3261,11 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) d_invalidate(dentry); #endif -end_unlink: +out_handle: + ext4_journal_stop(handle); +out_bh: brelse(bh); - if (handle) - ext4_journal_stop(handle); +out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f2df2db0786c..f014c5e473a9 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -140,7 +140,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) return; } ctx->cur_step++; - /* fall-through */ + fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); @@ -148,7 +148,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) return; } ctx->cur_step++; - /* fall-through */ + fallthrough; default: __read_end_io(ctx->bio); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0907f907c47d..8b2736283481 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -66,10 +66,10 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); -static void ext4_mark_recovery_complete(struct super_block *sb, +static int ext4_mark_recovery_complete(struct super_block *sb, struct 
ext4_super_block *es); -static void ext4_clear_journal_err(struct super_block *sb, - struct ext4_super_block *es); +static int ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -744,6 +744,7 @@ void __ext4_msg(struct super_block *sb, struct va_format vaf; va_list args; + atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; @@ -754,9 +755,12 @@ void __ext4_msg(struct super_block *sb, va_end(args); } -#define ext4_warning_ratelimit(sb) \ - ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ - "EXT4-fs warning") +static int ext4_warning_ratelimit(struct super_block *sb) +{ + atomic_inc(&EXT4_SB(sb)->s_warning_count); + return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning"); +} void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) 
@@ -1100,7 +1104,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1123,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); @@ -1216,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ -1288,8 +1293,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, if (!page_has_buffers(page)) return 0; if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_DIRECT_RECLAIM); + return jbd2_journal_try_to_free_buffers(journal, page); + return try_to_free_buffers(page); } @@ -1387,10 +1392,9 @@ retry: return res; } -static const union fscrypt_context * -ext4_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { - return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; + return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1409,7 +1413,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .get_dummy_context = ext4_get_dummy_context, + .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, 
.has_stable_inodes = ext4_has_stable_inodes, @@ -1522,6 +1526,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, + Opt_prefetch_block_bitmaps, }; static const match_table_t tokens = { @@ -1614,6 +1619,7 @@ static const match_table_t tokens = { {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ + {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1831,6 +1837,8 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + MOPT_SET}, {Opt_err, 0, 0} }; @@ -1879,12 +1887,13 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. 
*/ - if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + if (is_remount && !sbi->s_dummy_enc_policy.policy) { ext4_msg(sb, KERN_WARNING, "Can't set test_dummy_encryption on remount"); return -1; } - err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + err = fscrypt_set_test_dummy_encryption(sb, arg->from, + &sbi->s_dummy_enc_policy); if (err) { if (err == -EEXIST) ext4_msg(sb, KERN_WARNING, @@ -3213,15 +3222,34 @@ static void print_daily_error_info(struct timer_list *t) static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; - ext4_group_t group, ngroups; - struct super_block *sb; + struct super_block *sb = elr->lr_super; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t group = elr->lr_next_group; unsigned long timeout = 0; + unsigned int prefetch_ios = 0; int ret = 0; - sb = elr->lr_super; - ngroups = EXT4_SB(sb)->s_groups_count; + if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { + elr->lr_next_group = ext4_mb_prefetch(sb, group, + EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); + if (prefetch_ios) + ext4_mb_prefetch_fini(sb, elr->lr_next_group, + prefetch_ios); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, + prefetch_ios); + if (group >= elr->lr_next_group) { + ret = 1; + if (elr->lr_first_not_zeroed != ngroups && + !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + elr->lr_next_group = elr->lr_first_not_zeroed; + elr->lr_mode = EXT4_LI_MODE_ITABLE; + ret = 0; + } + } + return ret; + } - for (group = elr->lr_next_group; group < ngroups; group++) { + for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; @@ -3239,9 +3267,10 @@ static int ext4_run_li_request(struct ext4_li_request *elr) timeout = jiffies; ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 
0 : 1); + trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; + EXT4_SB(elr->lr_super)->s_li_wait_mult; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3256,15 +3285,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr) */ static void ext4_remove_li_request(struct ext4_li_request *elr) { - struct ext4_sb_info *sbi; - if (!elr) return; - sbi = elr->lr_sbi; - list_del(&elr->lr_request); - sbi->s_li_request = NULL; + EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } @@ -3473,7 +3498,6 @@ static int ext4_li_info_new(void) static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { - struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); @@ -3481,8 +3505,13 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, return NULL; elr->lr_super = sb; - elr->lr_sbi = sbi; - elr->lr_next_group = start; + elr->lr_first_not_zeroed = start; + if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; + else { + elr->lr_mode = EXT4_LI_MODE_ITABLE; + elr->lr_next_group = start; + } /* * Randomize first schedule time of the request to @@ -3512,8 +3541,9 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (first_not_zeroed == ngroups || sb_rdonly(sb) || - !test_opt(sb, INIT_INODE_TABLE)) + if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || sb_rdonly(sb) || + !test_opt(sb, INIT_INODE_TABLE))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); @@ -4710,11 +4740,13 @@ no_journal: ext4_set_resv_clusters(sb); - err = ext4_setup_system_zone(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)", err); - goto failed_mount4a; + if (test_opt(sb, BLOCK_VALIDITY)) { + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, 
KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } } ext4_ext_init(sb); @@ -4777,12 +4809,23 @@ no_journal: } #endif /* CONFIG_QUOTA */ + /* + * Save the original bdev mapping's wb_err value which could be + * used to detect the metadata async write error. + */ + spin_lock_init(&sbi->s_bdev_wb_lock); + if (!sb_rdonly(sb)) + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); - ext4_mark_recovery_complete(sb, es); + err = ext4_mark_recovery_complete(sb, es); + if (err) + goto failed_mount8; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -4816,6 +4859,8 @@ no_journal: ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + atomic_set(&sbi->s_warning_count, 0); + atomic_set(&sbi->s_msg_count, 0); kfree(orig_data); return 0; @@ -4825,10 +4870,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; -#ifdef CONFIG_QUOTA failed_mount8: ext4_unregister_sysfs(sb); -#endif failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -4892,7 +4935,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); ext4_blkdev_remove(sbi); brelse(bh); out_fail: @@ -4968,7 +5011,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return NULL; journal_inode = 
ext4_get_journal_inode(sb, journal_inum); if (!journal_inode) @@ -4998,7 +5042,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return NULL; bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) @@ -5089,8 +5134,10 @@ static int ext4_load_journal(struct super_block *sb, dev_t journal_dev; int err = 0; int really_read_only; + int journal_dev_ro; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -5100,7 +5147,31 @@ static int ext4_load_journal(struct super_block *sb, } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); - really_read_only = bdev_read_only(sb->s_bdev); + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, + "filesystem has both journal inode and journal device!"); + return -EINVAL; + } + + if (journal_inum) { + journal = ext4_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + } else { + journal = ext4_get_dev_journal(sb, journal_dev); + if (!journal) + return -EINVAL; + } + + journal_dev_ro = bdev_read_only(journal->j_dev); + really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; + + if (journal_dev_ro && !sb_rdonly(sb)) { + ext4_msg(sb, KERN_ERR, + "journal device read-only, try mounting with '-o ro'"); + err = -EROFS; + goto err_out; + } /* * Are we loading a blank journal or performing recovery after a @@ -5115,27 +5186,14 @@ static int ext4_load_journal(struct super_block *sb, ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); - return -EROFS; + err = -EROFS; + goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } - if (journal_inum && journal_dev) { - ext4_msg(sb, KERN_ERR, "filesystem has both journal " 
- "and inode journals!"); - return -EINVAL; - } - - if (journal_inum) { - if (!(journal = ext4_get_journal(sb, journal_inum))) - return -EINVAL; - } else { - if (!(journal = ext4_get_dev_journal(sb, journal_dev))) - return -EINVAL; - } - if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); @@ -5155,12 +5213,16 @@ static int ext4_load_journal(struct super_block *sb, if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); - jbd2_journal_destroy(journal); - return err; + goto err_out; } EXT4_SB(sb)->s_journal = journal; - ext4_clear_journal_err(sb, es); + err = ext4_clear_journal_err(sb, es); + if (err) { + EXT4_SB(sb)->s_journal = NULL; + jbd2_journal_destroy(journal); + return err; + } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -5171,6 +5233,10 @@ static int ext4_load_journal(struct super_block *sb, } return 0; + +err_out: + jbd2_journal_destroy(journal); + return err; } static int ext4_commit_super(struct super_block *sb, int sync) @@ -5183,13 +5249,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) return error; /* - * The superblock bh should be mapped, but it might not be if the - * device was hot-removed. Not much we can do but fail the I/O. - */ - if (!buffer_mapped(sbh)) - return error; - - /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system @@ -5256,26 +5315,32 @@ static int ext4_commit_super(struct super_block *sb, int sync) * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. 
*/ -static void ext4_mark_recovery_complete(struct super_block *sb, - struct ext4_super_block *es) +static int ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) { + int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { - BUG_ON(journal != NULL); - return; + if (journal != NULL) { + ext4_error(sb, "Journal got removed while the fs was " + "mounted!"); + return -EFSCORRUPTED; + } + return 0; } jbd2_journal_lock_updates(journal); - if (jbd2_journal_flush(journal) < 0) + err = jbd2_journal_flush(journal); + if (err < 0) goto out; if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } - out: jbd2_journal_unlock_updates(journal); + return err; } /* @@ -5283,14 +5348,17 @@ out: * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext4_clear_journal_err(struct super_block *sb, +static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; - BUG_ON(!ext4_has_feature_journal(sb)); + if (!ext4_has_feature_journal(sb)) { + ext4_error(sb, "Journal got removed while the fs was mounted!"); + return -EFSCORRUPTED; + } journal = EXT4_SB(sb)->s_journal; @@ -5315,6 +5383,7 @@ static void ext4_clear_journal_err(struct super_block *sb, jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } + return 0; } /* @@ -5457,7 +5526,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long old_sb_flags; + unsigned long old_sb_flags, vfs_flags; struct ext4_mount_options old_opts; int enable_quota = 0; ext4_group_t g; @@ -5500,6 +5569,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio 
= sbi->s_journal->j_task->io_context->ioprio; + /* + * Some options can be enabled by ext4 and/or by VFS mount flag + * either way we need to make sure it matches in both *flags and + * s_flags. Copy those selected flags from *flags to s_flags + */ + vfs_flags = SB_LAZYTIME | SB_I_VERSION; + sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; @@ -5553,9 +5630,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & SB_LAZYTIME) - sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; @@ -5585,8 +5659,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - if (sbi->s_journal) + if (sbi->s_journal) { + /* + * We let remount-ro finish even if marking fs + * as clean failed... + */ ext4_mark_recovery_complete(sb, es); + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); } else { @@ -5629,13 +5708,24 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } /* + * Update the original bdev mapping's wb_err value + * which could be used to detect the metadata async + * write error. + */ + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + + /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) 
*/ - if (sbi->s_journal) - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) { + err = ext4_clear_journal_err(sb, es); + if (err) + goto restore_opts; + } sbi->s_mount_state = le16_to_cpu(es->s_state); err = ext4_setup_super(sb, es, 0); @@ -5665,7 +5755,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); } - ext4_setup_system_zone(sb); + /* + * Handle creation of system zone data early because it can fail. + * Releasing of existing data is done when we are sure remount will + * succeed. + */ + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) { + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + } + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb, 1); if (err) @@ -5686,8 +5786,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } #endif + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); + + /* + * Some options can be enabled by ext4 and/or by VFS mount flag + * either way we need to make sure it matches in both *flags and + * s_flags. Copy those selected flags from s_flags to *flags + */ + *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. 
Opts: %s", orig_data); kfree(orig_data); return 0; @@ -5701,6 +5809,8 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6c9fc9e21c13..bfabb799fa45 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -189,6 +189,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) +#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname) + #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -215,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); @@ -226,6 +230,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif +EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count); +EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); @@ 
-240,6 +246,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); +EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -257,6 +265,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_inode_prealloc), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), @@ -267,6 +276,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(errors_count), + ATTR_LIST(warning_count), + ATTR_LIST(msg_count), ATTR_LIST(first_error_ino), ATTR_LIST(last_error_ino), ATTR_LIST(first_error_block), @@ -283,6 +294,8 @@ static struct attribute *ext4_attrs[] = { #ifdef CONFIG_EXT4_DEBUG ATTR_LIST(simulate_fail), #endif + ATTR_LIST(mb_prefetch), + ATTR_LIST(mb_prefetch_limit), NULL, }; ATTRIBUTE_GROUPS(ext4); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7d2f6576d954..cba4b877c606 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1356,8 +1356,7 @@ retry: block = 0; while (wsize < bufsize) { - if (bh != NULL) - brelse(bh); + brelse(bh); csize = (bufsize - wsize) > blocksize ? 
blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ed2bca0fce92..73683e58a08d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3550,6 +3550,9 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, unsigned long align = offset | iov_iter_alignment(iter); struct block_device *bdev = inode->i_sb->s_bdev; + if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) + return 1; + if (align & blocksize_mask) { if (bdev) blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 069f498af1e3..53fbc4dd6e48 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -111,7 +111,7 @@ static int __f2fs_setup_filename(const struct inode *dir, #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif - if (crypt_name->is_ciphertext_name) { + if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { @@ -537,7 +537,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; if (IS_ENCRYPTED(inode)) { - err = fscrypt_inherit_context(dir, inode, page, false); + err = fscrypt_set_context(inode, page); if (err) goto put_error; } @@ -1032,7 +1032,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (err) goto out; - err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); + err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); if (err < 0) goto out; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16322ea5b463..7c089ff7ff94 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,7 +138,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ + struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption 
*/ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint @@ -1315,13 +1315,6 @@ enum fsync_mode { #define IS_IO_TRACED_PAGE(page) (0) #endif -#ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) -#else -#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) -#endif - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -2646,7 +2639,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_NEW_INODE: if (set) return; - /* fall through */ + fallthrough; case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: @@ -4022,22 +4015,6 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) -{ -#ifdef CONFIG_FS_ENCRYPTION - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - umode_t mode = inode->i_mode; - - /* - * If the directory encrypted or dummy encryption enabled, - * then we should encrypt the inode. 
- */ - if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) - return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); -#endif - return false; -} - static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 84e4bbc1a64d..45f324511a19 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -28,6 +28,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + bool encrypt = false; int xattr_size = 0; int err; @@ -69,13 +70,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; + err = dquot_initialize(inode); if (err) goto fail_drop; set_inode_flag(inode, FI_NEW_INODE); - if (f2fs_may_encrypt(dir, inode)) + if (encrypt) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi)) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9bbaa2614679..cb1b5b61a1da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -618,10 +618,10 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) switch (dn->max_level) { case 3: base += 2 * indirect_blks; - /* fall through */ + fallthrough; case 2: base += 2 * direct_blks; - /* fall through */ + fallthrough; case 1: base += direct_index; break; @@ -2373,6 +2373,9 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (unlikely(nid >= nm_i->max_nid)) nid = 0; + if (unlikely(nid % NAT_ENTRY_PER_BLOCK)) + nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a65d357f89a9..e247a5ef3713 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -799,7 +799,7 @@ static void __locate_dirty_segment(struct 
f2fs_sb_info *sbi, unsigned int segno, if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned short valid_blocks = + block_t valid_blocks = get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, unlikely(!valid_blocks || @@ -815,7 +815,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned short valid_blocks; + block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; @@ -4316,8 +4316,8 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; - unsigned short valid_blocks; - unsigned short blks_per_sec = BLKS_PER_SEC(sbi); + block_t valid_blocks; + block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { /* find dirty segment based on free segmap */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dfa072fa8081..bef2be3fa3d0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -433,12 +433,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. 
*/ - if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) { f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); return -EINVAL; } err = fscrypt_set_test_dummy_encryption( - sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy); if (err) { if (err == -EEXIST) f2fs_warn(sbi, @@ -1275,7 +1275,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -2482,10 +2482,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static const union fscrypt_context * -f2fs_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) { - return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2523,7 +2522,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .get_dummy_context = f2fs_get_dummy_context, + .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3864,7 +3863,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); kvfree(options); free_sb_buf: kfree(raw_super); diff --git a/fs/fcntl.c b/fs/fcntl.c index 2e4c0fa2074b..19ac5baad50f 100644 --- a/fs/fcntl.c +++ 
b/fs/fcntl.c @@ -362,7 +362,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_OFD_SETLK: case F_OFD_SETLKW: #endif - /* Fallthrough */ + fallthrough; case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) @@ -771,7 +771,7 @@ static void send_sigio_to_task(struct task_struct *p, if (!do_send_sig_info(signum, &si, p, type)) break; } - /* fall-through - fall back on the old plain SIGIO signal */ + fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..4559b5fec3bd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> +#include <linux/io_uring.h> unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { + io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a605c3dddabc..e6005c78bfa9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,7 +42,6 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; - unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -144,7 +143,9 @@ static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -1122,7 +1123,9 @@ void inode_io_list_del(struct inode *inode) struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); inode_io_list_del_locked(inode, wb); + spin_unlock(&inode->i_lock); 
spin_unlock(&wb->list_lock); } EXPORT_SYMBOL(inode_io_list_del); @@ -1172,8 +1175,10 @@ void sb_clear_inode_writeback(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { + assert_spin_locked(&inode->i_lock); + if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1182,6 +1187,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); + inode->i_state &= ~I_SYNC_QUEUED; +} + +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +{ + spin_lock(&inode->i_lock); + redirty_tail_locked(inode, wb); + spin_unlock(&inode->i_lock); } /* @@ -1220,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) #define EXPIRE_DIRTY_ATIME 0x0001 /* - * Move expired (dirtied before work->older_than_this) dirty inodes from + * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. 
*/ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - int flags, - struct wb_writeback_work *work) + unsigned long dirtied_before) { - unsigned long *older_than_this = NULL; - unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -1237,21 +1247,15 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; - if ((flags & EXPIRE_DIRTY_ATIME) == 0) - older_than_this = work->older_than_this; - else if (!work->for_sync) { - expire_time = jiffies - (dirtytime_expire_interval * HZ); - older_than_this = &expire_time; - } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (inode_dirtied_after(inode, dirtied_before)) break; list_move(&inode->i_io_list, &tmp); moved++; - if (flags & EXPIRE_DIRTY_ATIME) - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); + spin_lock(&inode->i_lock); + inode->i_state |= I_SYNC_QUEUED; + spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -1289,18 +1293,22 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, + unsigned long dirtied_before) { int moved; + unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); + if (!work->for_sync) + time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, - EXPIRE_DIRTY_ATIME, work); + time_expire_jif); if (moved) wb_io_lists_populated(wb); - trace_writeback_queue_io(wb, work, moved); + 
trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -1394,7 +1402,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); return; } @@ -1414,7 +1422,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -1422,10 +1430,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * such as delayed allocation during submission or metadata * updates after data IO completion. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. 
*/ inode_io_list_del_locked(inode, wb); @@ -1472,18 +1481,14 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - if (inode->i_state & I_DIRTY_TIME) { - if ((dirty & I_DIRTY_INODE) || - wbc->sync_mode == WB_SYNC_ALL || - unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || - unlikely(time_after(jiffies, - (inode->dirtied_time_when + - dirtytime_expire_interval * HZ)))) { - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; - trace_writeback_lazytime(inode); - } - } else - inode->i_state &= ~I_DIRTY_TIME_EXPIRED; + if ((inode->i_state & I_DIRTY_TIME) && + ((dirty & I_DIRTY_INODE) || + wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || + time_after(jiffies, inode->dirtied_time_when + + dirtytime_expire_interval * HZ))) { + dirty |= I_DIRTY_TIME; + trace_writeback_lazytime(inode); + } inode->i_state &= ~dirty; /* @@ -1669,8 +1674,8 @@ static long writeback_sb_inodes(struct super_block *sb, */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); - redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { @@ -1811,7 +1816,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, &work); + queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); @@ -1831,7 +1836,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * - * older_than_this takes precedence over nr_to_write. So we'll only write back + * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. 
*/ static long wb_writeback(struct bdi_writeback *wb, @@ -1839,14 +1844,11 @@ static long wb_writeback(struct bdi_writeback *wb, { unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; - unsigned long oldest_jif; + unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; - oldest_jif = jiffies; - work->older_than_this = &oldest_jif; - blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { @@ -1880,14 +1882,14 @@ static long wb_writeback(struct bdi_writeback *wb, * safe. */ if (work->for_kupdate) { - oldest_jif = jiffies - + dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) - oldest_jif = jiffies; + dirtied_before = jiffies; trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) - queue_io(wb, work); + queue_io(wb, work, dirtied_before); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else @@ -2182,7 +2184,7 @@ static int __init start_dirtytime_writeback(void) __initcall(start_dirtytime_writeback); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2289,11 +2291,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * If the inode is queued for writeback by flush worker, just + * update its dirty state. Once the flush worker is done with + * the inode it will place it on the appropriate superblock + * list, based upon its state. 
*/ - if (inode->i_state & I_SYNC) + if (inode->i_state & I_SYNC_QUEUED) goto out_unlock_inode; /* @@ -2318,7 +2321,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) wb = locked_inode_to_wb_and_lock_list(inode); - WARN(bdi_cap_writeback_dirty(wb->bdi) && + WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) && !test_bit(WB_registered, &wb->state), "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); @@ -2343,7 +2346,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) * to make sure background write-back happens * later. */ - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + if (wakeup_bdi && + (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } @@ -2578,7 +2582,7 @@ int write_inode_now(struct inode *inode, int sync) .range_end = LLONG_MAX, }; - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); diff --git a/fs/fs_context.c b/fs/fs_context.c index 7d5c5dd2b1d5..2834d1afa6e8 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -521,7 +521,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (param->type) { case fs_value_is_string: len = 1 + param->size; - /* Fall through */ + fallthrough; case fs_value_is_flag: len += strlen(param->key); break; diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ab53e42a874a..68b0148f4bb8 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -189,7 +189,7 @@ out: } EXPORT_SYMBOL(fs_lookup_param); -int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) +static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } diff --git a/fs/fsopen.c b/fs/fsopen.c index 2fa3f241b762..27a890aa493a 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -412,7 +412,7 @@ SYSCALL_DEFINE5(fsconfig, break; case FSCONFIG_SET_PATH_EMPTY: lookup_flags = LOOKUP_EMPTY; - /* fallthru */ + fallthrough; case 
FSCONFIG_SET_PATH: param.type = fs_value_is_filename; param.name = getname_flags(_value, lookup_flags, NULL); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6611ef3269a8..43c165e796da 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3091,11 +3091,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_iter_count(iter); + size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; @@ -3103,17 +3102,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode = file->f_mapping->host; i_size = i_size_read(inode); - if ((iov_iter_rw(iter) == READ) && (offset > i_size)) + if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; - /* optimization for short read */ - if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { - if (offset >= i_size) - return 0; - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); - count = iov_iter_count(iter); - } - io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); if (!io) return -ENOMEM; @@ -3129,15 +3120,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = async_dio; + io->async = ff->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); + /* optimization for short read */ + if (io->async && !io->write && offset + count > i_size) { + iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); + shortened = count - iov_iter_count(iter); + count -= shortened; + } + /* * We cannot asynchronously extend the size of a file. * In such case the aio will behave exactly like sync io. 
*/ - if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE) + if ((offset + count > i_size) && io->write) io->blocking = true; if (io->async && io->blocking) { @@ -3155,6 +3153,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else { ret = __fuse_direct_read(io, iter, &pos); } + iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); if (io->async) { bool blocking = io->blocking; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bba747520e9b..581329203d68 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1049,9 +1049,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; + sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 770f3a720db9..0f69fbd4af66 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -746,7 +746,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, } if (n == 0) break; - /* fall through - To branching from existing tree */ + fallthrough; /* To branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < mp->mp_fheight) gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); @@ -757,7 +757,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, state = ALLOC_DATA; if (n == 0) break; - /* fall through - To tree complete, adding data blocks */ + fallthrough; /* To tree complete, adding data blocks */ case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a58333e3980d..3763c9ff1406 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -902,6 +902,36 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } /** + * drain_bd - drain the buf and databuf queue for 
a failed transaction + * @tr: the transaction to drain + * + * When this is called, we're taking an error exit for a log write that failed + * but since we bypassed the after_commit functions, we need to remove the + * items from the buf and databuf queue. + */ +static void trans_drain(struct gfs2_trans *tr) +{ + struct gfs2_bufdata *bd; + struct list_head *head; + + if (!tr) + return; + + head = &tr->tr_buf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + head = &tr->tr_databuf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +/** * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem * @gl: The glock structure to flush. If NULL, flush the whole incore log @@ -1005,6 +1035,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) out: if (gfs2_withdrawn(sdp)) { + trans_drain(tr); /** * If the tr_list is empty, we're withdrawing during a log * flush that targets a transaction, but the transaction was diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 4b67d47a7e00..6e173ae378c4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1599,7 +1599,7 @@ static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) case GFS2_QUOTA_ON: state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; - /*FALLTHRU*/ + fallthrough; case GFS2_QUOTA_ACCOUNT: state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | QCI_SYSFILE; diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index e1c7eb6eb00a..6d4bf7ea7b3b 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -67,6 +67,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); 
INIT_LIST_HEAD(&tr->tr_buf); + INIT_LIST_HEAD(&tr->tr_list); INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 61eec628805d..0350dc7821bf 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -195,7 +195,7 @@ reread: switch (sbi->s_vhdr->signature) { case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): set_bit(HFSPLUS_SB_HFSX, &sbi->flags); - /*FALLTHRU*/ + fallthrough; case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): break; case cpu_to_be16(HFSP_WRAP_MAGIC): diff --git a/fs/internal.h b/fs/internal.h index 10517ece4516..a7cd0f64faa4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -82,9 +82,6 @@ int may_linkat(struct path *link); /* * namespace.c */ -extern void *copy_mount_options(const void __user *); -extern char *copy_mount_string(const void __user *); - extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); diff --git a/fs/io-wq.c b/fs/io-wq.c index e92c4724480c..0a182f1333e8 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,7 @@ #include <linux/rculist_nulls.h> #include <linux/fs_struct.h> #include <linux/task_work.h> +#include <linux/blk-cgroup.h> #include "io-wq.h" @@ -26,9 +27,8 @@ enum { IO_WORKER_F_UP = 1, /* up and active */ IO_WORKER_F_RUNNING = 2, /* account as running */ IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_EXITING = 8, /* worker exiting */ - IO_WORKER_F_FIXED = 16, /* static idle worker */ - IO_WORKER_F_BOUND = 32, /* is doing bounded work */ + IO_WORKER_F_FIXED = 8, /* static idle worker */ + IO_WORKER_F_BOUND = 16, /* is doing bounded work */ }; enum { @@ -57,9 +57,13 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct nsproxy *restore_nsproxy; struct fs_struct *restore_fs; }; @@ -87,7 +91,7 
@@ enum { */ struct io_wqe { struct { - spinlock_t lock; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; @@ -148,11 +152,12 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (current->files != worker->restore_files) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; task_lock(current); current->files = worker->restore_files; + current->nsproxy = worker->restore_nsproxy; task_unlock(current); } @@ -166,7 +171,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (worker->mm) { if (!dropped_lock) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; } __set_current_state(TASK_RUNNING); @@ -175,6 +180,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (worker->blkcg_css) { + kthread_associate_blkcg(NULL); + worker->blkcg_css = NULL; + } +#endif + return dropped_lock; } @@ -200,7 +212,6 @@ static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); - unsigned nr_workers; /* * If we're not at zero, someone else is holding a brief reference @@ -220,23 +231,19 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; preempt_enable(); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); if (__io_worker_unuse(wqe, worker)) { __release(&wqe->lock); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } acct->nr_workers--; - nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; - spin_unlock_irq(&wqe->lock); - - /* all workers gone, wq exit can proceed */ - if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) - 
complete(&wqe->wq->done); + raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); + if (refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -318,6 +325,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_nsproxy = current->nsproxy; worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -436,6 +444,17 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) work->flags |= IO_WQ_WORK_CANCEL; } +static inline void io_wq_switch_blkcg(struct io_worker *worker, + struct io_wq_work *work) +{ +#ifdef CONFIG_BLK_CGROUP + if (work->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->blkcg_css); + worker->blkcg_css = work->blkcg_css; + } +#endif +} + static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { @@ -454,6 +473,7 @@ static void io_impersonate_work(struct io_worker *worker, if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; + current->nsproxy = work->nsproxy; task_unlock(current); } if (work->fs && current->fs != work->fs) @@ -463,6 +483,7 @@ static void io_impersonate_work(struct io_worker *worker, if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + io_wq_switch_blkcg(worker, work); } static void io_assign_current_work(struct io_worker *worker, @@ -504,7 +525,7 @@ get_next: else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -538,17 +559,17 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); wqe->hash_map 
&= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } } while (work); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } while (1); } @@ -563,7 +584,7 @@ static int io_wqe_worker(void *data) while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); @@ -574,7 +595,7 @@ loop: __release(&wqe->lock); goto loop; } - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (signal_pending(current)) flush_signals(current); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) @@ -586,11 +607,11 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) io_worker_handle_work(worker); else - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } io_worker_exit(worker); @@ -630,14 +651,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); io_wqe_dec_running(wqe, worker); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct =&wqe->acct[index]; + struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); @@ -656,7 +677,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) return false; } - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -665,11 
+686,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; acct->nr_workers++; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); + refcount_inc(&wq->refs); wake_up_process(worker->task); return true; } @@ -685,28 +707,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct io_worker *worker; + bool ret = false; + + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + if (io_worker_get(worker)) { + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + + return ret; +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + /* * Manager thread. Tasked with creating new workers, if we need them. 
*/ static int io_wq_manager(void *data) { struct io_wq *wq = data; - int workers_to_create = num_possible_nodes(); int node; /* create fixed workers */ - refcount_set(&wq->refs, workers_to_create); + refcount_set(&wq->refs, 1); for_each_node(node) { if (!node_online(node)) continue; - if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - goto err; - workers_to_create--; + if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + continue; + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + goto out; } - while (workers_to_create--) - refcount_dec(&wq->refs); - complete(&wq->done); while (!kthread_should_stop()) { @@ -720,12 +777,12 @@ static int io_wq_manager(void *data) if (!node_online(node)) continue; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) fork_worker[IO_WQ_ACCT_UNBOUND] = true; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (fork_worker[IO_WQ_ACCT_BOUND]) create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); if (fork_worker[IO_WQ_ACCT_UNBOUND]) @@ -738,12 +795,18 @@ static int io_wq_manager(void *data) if (current->task_works) task_work_run(); - return 0; -err: - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (refcount_sub_and_test(workers_to_create, &wq->refs)) +out: + if (refcount_dec_and_test(&wq->refs)) { complete(&wq->done); + return 0; + } + /* if ERROR is set and we get here, we have workers to wake */ + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + rcu_read_lock(); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); + rcu_read_unlock(); + } return 0; } @@ -821,10 +884,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); 
io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -850,37 +913,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - void io_wq_cancel_all(struct io_wq *wq) { int node; @@ -925,6 +957,24 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } +static inline void io_wqe_remove_pending(struct io_wqe *wqe, + struct io_wq_work *work, + struct io_wq_work_node *prev) +{ + unsigned int hash = io_get_work_hash(work); + struct io_wq_work *prev_work = NULL; + + if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (prev) + prev_work = container_of(prev, struct io_wq_work, list); + if (prev_work && io_get_work_hash(prev_work) == hash) + wqe->hash_tail[hash] = prev_work; + else + wqe->hash_tail[hash] = NULL; + } + wq_list_del(&wqe->work_list, &work->list, prev); +} + static void io_wqe_cancel_pending_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { @@ -933,14 +983,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, unsigned long flags; retry: - 
spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - - wq_list_del(&wqe->work_list, node, prev); - spin_unlock_irqrestore(&wqe->lock, flags); + io_wqe_remove_pending(wqe, work, prev); + raw_spin_unlock_irqrestore(&wqe->lock, flags); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -949,7 +998,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1057,7 +1106,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); wqe->wq = wq; - spin_lock_init(&wqe->lock); + raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_LIST_HEAD(&wqe->all_list); @@ -1096,12 +1145,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) return refcount_inc_not_zero(&wq->use_refs); } -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - wake_up_process(worker->task); - return false; -} - static void __io_wq_destroy(struct io_wq *wq) { int node; diff --git a/fs/io-wq.h b/fs/io-wq.h index ddaf9614cf9b..84bcf6a85523 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -87,7 +87,11 @@ struct io_wq_work { struct io_wq_work_node list; struct files_struct *files; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *creds; + struct nsproxy *nsproxy; struct fs_struct *fs; unsigned long fsize; unsigned flags; diff --git a/fs/io_uring.c b/fs/io_uring.c index dc506b75659c..fc6de6b4784e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,6 +79,8 @@ #include <linux/splice.h> #include <linux/task_work.h> #include 
<linux/pagemap.h> +#include <linux/io_uring.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -98,6 +100,8 @@ #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -187,6 +191,7 @@ struct io_mapped_ubuf { size_t len; struct bio_vec *bvec; unsigned int nr_bvecs; + unsigned long acct_pages; }; struct fixed_file_table { @@ -205,7 +210,7 @@ struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; - struct percpu_ref *cur_refs; + struct fixed_file_ref_node *node; struct percpu_ref refs; struct completion done; struct list_head ref_list; @@ -219,6 +224,27 @@ struct io_buffer { __u16 bid; }; +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_sq_data { + refcount_t refs; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + struct list_head ctx_new_list; + struct mutex ctx_lock; + + struct task_struct *thread; + struct wait_queue_head wait; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,6 +257,7 @@ struct io_ring_ctx { unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; + unsigned int restricted: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -264,9 +291,25 @@ struct io_ring_ctx { /* IO offload */ struct io_wq *io_wq; - struct task_struct *sqo_thread; /* if using sq thread polling */ - struct mm_struct *sqo_mm; - wait_queue_head_t sqo_wait; + + /* + * For SQPOLL usage - we hold a reference to the parent task, so we + * have access to 
the ->files + */ + struct task_struct *sqo_task; + + /* Only used for accounting purposes */ + struct mm_struct *mm_account; + +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *sqo_blkcg_css; +#endif + + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct wait_queue_entry sqo_wait_entry; + struct list_head sqd_list; /* * If used, fixed file set. Writers must ensure that ->refs is dead, @@ -275,8 +318,6 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; - int ring_fd; - struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -338,6 +379,7 @@ struct io_ring_ctx { struct llist_head file_put_llist; struct work_struct exit_work; + struct io_restriction restrictions; }; /* @@ -392,13 +434,16 @@ struct io_cancel { struct io_timeout { struct file *file; - u64 addr; - int flags; u32 off; u32 target_seq; struct list_head list; }; +struct io_timeout_rem { + struct file *file; + u64 addr; +}; + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -514,15 +559,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_async_ctx { - union { - struct io_async_rw rw; - struct io_async_msghdr msg; - struct io_async_connect connect; - struct io_timeout_data timeout; - }; -}; - enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -540,12 +576,10 @@ enum { REQ_F_ISREG_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, - REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, - REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -583,8 +617,6 @@ enum { REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* in overflow list */ - REQ_F_OVERFLOW = 
BIT(REQ_F_OVERFLOW_BIT), /* already went through poll handler */ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ @@ -593,8 +625,6 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), - /* req->task is refcounted */ - REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -617,6 +647,7 @@ struct io_kiocb { struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; + struct io_timeout_rem timeout_rem; struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; @@ -632,7 +663,8 @@ struct io_kiocb { struct io_completion compl; }; - struct io_async_ctx *io; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -700,8 +732,6 @@ struct io_submit_state { }; struct io_op_def { - /* needs req->io allocated for deferral/async */ - unsigned async_ctx : 1; /* needs current->mm setup, does mm access */ unsigned needs_mm : 1; /* needs req->file assigned */ @@ -723,35 +753,49 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + /* needs rlimit(RLIMIT_FSIZE) assigned */ unsigned needs_fsize : 1; + /* must always have async data allocated */ + unsigned needs_async_data : 1; + /* needs blkcg context, issues async io potentially */ + unsigned needs_blkcg : 1; + /* size of async data needed, if any */ + unsigned short async_size; }; -static const struct io_op_def io_op_defs[] = { +static const struct io_op_def io_op_defs[] __read_mostly = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .hash_reg_file = 1, 
.unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -759,6 +803,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -767,27 +813,33 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_SENDMSG] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, .pollout = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { - .async_ctx = 1, .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_TIMEOUT_REMOVE] = {}, [IORING_OP_ACCEPT] = { @@ -799,28 +851,33 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .async_ctx = 1, .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_CONNECT] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_connect), }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .needs_fsize = 1, + .needs_blkcg = 1, }, 
[IORING_OP_OPENAT] = { .file_table = 1, .needs_fs = 1, + .needs_blkcg = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, .needs_file_no_error = 1, .file_table = 1, + .needs_blkcg = 1, }, [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, @@ -830,6 +887,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_fs = 1, .file_table = 1, + .needs_blkcg = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -837,6 +895,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { .needs_mm = 1, @@ -844,18 +904,23 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_MADVISE] = { .needs_mm = 1, + .needs_blkcg = 1, }, [IORING_OP_SEND] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_blkcg = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, @@ -863,10 +928,12 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_blkcg = 1, }, [IORING_OP_OPENAT2] = { .file_table = 1, .needs_fs = 1, + .needs_blkcg = 1, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -876,6 +943,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .needs_blkcg = 1, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -903,13 +971,10 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_prep_work_files(struct io_kiocb *req); static void __io_clean_op(struct io_kiocb *req); -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file 
**out_file, bool fixed); -static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_comp_state *cs); +static struct file *io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs); static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, @@ -921,7 +986,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops; +static const struct file_operations io_uring_fops __read_mostly; struct sock *io_uring_get_socket(struct file *file) { @@ -936,27 +1001,13 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -static void io_get_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - return; - get_task_struct(req->task); - req->flags |= REQ_F_TASK_PINNED; -} - static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | + REQ_F_INFLIGHT)) __io_clean_op(req); } -/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ -static void __io_put_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - put_task_struct(req->task); -} - static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -971,9 +1022,10 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !mmget_not_zero(ctx->sqo_mm))) + !ctx->sqo_task->mm || + !mmget_not_zero(ctx->sqo_task->mm))) return -EFAULT; - kthread_use_mm(ctx->sqo_mm); + kthread_use_mm(ctx->sqo_task->mm); } return 0; @@ -987,6 +1039,26 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, return __io_sq_thread_acquire_mm(ctx); } 
+static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, + struct cgroup_subsys_state **cur_css) + +{ +#ifdef CONFIG_BLK_CGROUP + /* puts the old one when swapping */ + if (*cur_css != ctx->sqo_blkcg_css) { + kthread_associate_blkcg(ctx->sqo_blkcg_css); + *cur_css = ctx->sqo_blkcg_css; + } +#endif +} + +static void io_sq_thread_unassociate_blkcg(void) +{ +#ifdef CONFIG_BLK_CGROUP + kthread_associate_blkcg(NULL); +#endif +} + static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) @@ -1056,7 +1128,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; - init_waitqueue_head(&ctx->sqo_wait); + init_waitqueue_head(&ctx->sqo_sq_wait); + INIT_LIST_HEAD(&ctx->sqd_list); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); @@ -1123,6 +1196,10 @@ static bool io_req_clean_work(struct io_kiocb *req) mmdrop(req->work.mm); req->work.mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (req->work.blkcg_css) + css_put(req->work.blkcg_css); +#endif if (req->work.creds) { put_cred(req->work.creds); req->work.creds = NULL; @@ -1148,20 +1225,45 @@ static bool io_req_clean_work(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; io_req_init_async(req); if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file) + if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + if (!req->work.files && io_op_defs[req->opcode].file_table && + !(req->flags & REQ_F_NO_FILE_TABLE)) { + req->work.files = get_files_struct(current); + get_nsproxy(current->nsproxy); + req->work.nsproxy = current->nsproxy; + req->flags |= REQ_F_INFLIGHT; + + 
spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; } +#ifdef CONFIG_BLK_CGROUP + if (!req->work.blkcg_css && def->needs_blkcg) { + rcu_read_lock(); + req->work.blkcg_css = blkcg_css(); + /* + * This should be rare, either the cgroup is dying or the task + * is moving cgroups. Just punt to root for the handful of ios. + */ + if (!css_tryget_online(req->work.blkcg_css)) + req->work.blkcg_css = NULL; + rcu_read_unlock(); + } +#endif if (!req->work.creds) req->work.creds = get_current_cred(); if (!req->work.fs && def->needs_fs) { @@ -1215,9 +1317,10 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; int ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); @@ -1228,14 +1331,36 @@ static void io_kill_timeout(struct io_kiocb *req) } } -static void io_kill_timeouts(struct io_ring_ctx *ctx) +static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!tsk || req->task == tsk) + return true; + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (ctx->sq_data && req->task == ctx->sq_data->thread) + return true; + } + return false; +} + +/* + * Returns true if we found and killed one or more timeouts + */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct io_kiocb *req, *tmp; + int canceled = 0; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) - io_kill_timeout(req); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_task_match(req, tsk)) { + io_kill_timeout(req); + canceled++; + } 
+ } spin_unlock_irq(&ctx->completion_lock); + return canceled != 0; } static void __io_queue_deferred(struct io_ring_ctx *ctx) @@ -1286,6 +1411,13 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_queue_deferred(ctx); } +static inline bool io_sqring_full(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -1319,8 +1451,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -1334,12 +1466,24 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } +static inline bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + if (!files) + return true; + if (req->flags & REQ_F_WORK_INITIALIZED) + return req->work.files == files; + return false; +} + /* Returns true if there are no backlogged entries after the flush */ -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, + struct task_struct *tsk, + struct files_struct *files) { struct io_rings *rings = ctx->rings; + struct io_kiocb *req, *tmp; struct io_uring_cqe *cqe; - struct io_kiocb *req; unsigned long flags; LIST_HEAD(list); @@ -1358,15 +1502,17 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ctx->cq_overflow_flushed = 1; cqe = NULL; - while (!list_empty(&ctx->cq_overflow_list)) { + list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { + if (tsk && req->task != tsk) + continue; + if (!io_match_files(req, files)) + continue; + cqe = 
io_get_cqring(ctx); if (!cqe && !force) break; - req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - compl.list); list_move(&req->compl.list, &list); - req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); @@ -1409,7 +1555,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed) { + } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * then we cannot store the request for later flushing, we need + * to drop it on the floor. + */ WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { @@ -1419,7 +1570,6 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } io_clean_op(req); - req->flags |= REQ_F_OVERFLOW; req->result = res; req->compl.cflags = cflags; refcount_inc(&req->refs); @@ -1513,10 +1663,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - struct io_kiocb *req; - if (!state->free_reqs) { + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; size_t sz; int ret; @@ -1533,14 +1681,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, goto fallback; ret = 1; } - state->free_reqs = ret - 1; - req = state->reqs[ret - 1]; - } else { - state->free_reqs--; - req = state->reqs[state->free_reqs]; + state->free_reqs = ret; } - return req; + state->free_reqs--; + return state->reqs[state->free_reqs]; fallback: return io_get_fallback_req(ctx); } @@ -1558,30 +1703,24 @@ static bool io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); - if (req->io) - 
kfree(req->io); + if (req->async_data) + kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - } - return io_req_clean_work(req); } static void __io_free_req_finish(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - __io_put_req_task(req); + atomic_long_inc(&tctx->req_complete); + if (tctx->in_idle) + wake_up(&tctx->wait); + put_task_struct(req->task); + if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else @@ -1624,10 +1763,11 @@ static void __io_free_req(struct io_kiocb *req) static bool io_link_cancel_timeout(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -1761,12 +1901,15 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; int ret, notify; + if (tsk->flags & PF_EXITING) + return -ESRCH; + /* * SQPOLL kernel thread doesn't need notification, just a wakeup. For * all other cases, use TWA_SIGNAL unconditionally to ensure we're @@ -1774,10 +1917,10 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) * will do the job. 
*/ notify = 0; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) + if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; - ret = task_work_add(tsk, cb, notify); + ret = task_work_add(tsk, &req->task_work, notify); if (!ret) wake_up_process(tsk); @@ -1801,8 +1944,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error) static void io_req_task_cancel(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; __io_req_task_cancel(req, -ECANCELED); + percpu_ref_put(&ctx->refs); } static void __io_req_task_submit(struct io_kiocb *req) @@ -1811,7 +1956,7 @@ static void __io_req_task_submit(struct io_kiocb *req) if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); + __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { __io_req_task_cancel(req, -EFAULT); @@ -1834,7 +1979,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -1888,6 +2033,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, if (rb->to_free) __io_req_free_batch_flush(ctx, rb); if (rb->task) { + atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -1902,16 +2048,15 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->flags & REQ_F_LINK_HEAD) io_queue_next(req); - if (req->flags & REQ_F_TASK_PINNED) { - if (req->task != rb->task) { - if (rb->task) - put_task_struct_many(rb->task, rb->task_refs); - rb->task = req->task; - rb->task_refs = 0; + if (req->task != rb->task) { + if (rb->task) { + atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); + 
put_task_struct_many(rb->task, rb->task_refs); } - rb->task_refs++; - req->flags &= ~REQ_F_TASK_PINNED; + rb->task = req->task; + rb->task_refs = 0; } + rb->task_refs++; WARN_ON_ONCE(io_dismantle_req(req)); rb->reqs[rb->to_free++] = req; @@ -1987,7 +2132,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) if (noflush && !list_empty(&ctx->cq_overflow_list)) return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false, NULL, NULL); } /* See comment at the top of this file */ @@ -2024,6 +2169,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) static inline bool io_run_task_work(void) { + /* + * Not safe to run on exiting task, and the task_work handling will + * not add work to such a task. + */ + if (unlikely(current->flags & PF_EXITING)) + return false; if (current->task_works) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -2063,6 +2214,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { + req->result = 0; req->iopoll_completed = 0; list_move_tail(&req->inflight_entry, &again); continue; @@ -2296,50 +2448,43 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) goto end_req; } - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); - if (!ret) + if (!req->async_data) { + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); + if (!ret) + return true; + kfree(iovec); + } else { return true; - kfree(iovec); + } end_req: req_set_fail_links(req); io_req_complete(req, ret); return false; } - -static void io_rw_resubmit(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_ring_ctx *ctx = 
req->ctx; - int err; - - err = io_sq_thread_acquire_mm(ctx, req); - - if (io_resubmit_prep(req, err)) { - refcount_inc(&req->refs); - io_queue_async_work(req); - } - - percpu_ref_put(&ctx->refs); -} #endif static bool io_rw_reissue(struct io_kiocb *req, long res) { #ifdef CONFIG_BLOCK + umode_t mode = file_inode(req->file)->i_mode; int ret; + if (!S_ISBLK(mode) && !S_ISREG(mode)) + return false; if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - init_task_work(&req->task_work, io_rw_resubmit); - percpu_ref_get(&req->ctx->refs); + ret = io_sq_thread_acquire_mm(req->ctx, req); - ret = io_req_task_work_add(req, &req->task_work); - if (!ret) + if (io_resubmit_prep(req, ret)) { + refcount_inc(&req->refs); + io_queue_async_work(req); return true; + } + #endif return false; } @@ -2410,8 +2555,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); } static void __io_state_file_put(struct io_submit_state *state) @@ -2440,7 +2585,6 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { state->has_refs--; - state->ios_left--; return state->file; } __io_state_file_put(state); @@ -2450,8 +2594,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->ios_left--; - state->has_refs = state->ios_left; + state->has_refs = state->ios_left - 1; return state->file; } @@ -2500,8 +2643,7 @@ static bool io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = 
req->ctx; struct kiocb *kiocb = &req->rw.kiocb; @@ -2536,12 +2678,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; - if (kiocb->ki_flags & IOCB_DIRECT) - io_get_req_task(req); - - if (force_nonblock) - kiocb->ki_flags |= IOCB_NOWAIT; - if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -2550,7 +2686,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; - io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -2578,7 +2713,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) * IO with EINTR. */ ret = -EINTR; - /* fall through */ + fallthrough; default: kiocb->ki_complete(kiocb, ret, 0); } @@ -2588,13 +2723,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ - if (req->io && req->io->rw.bytes_done > 0) { + if (io && io->bytes_done > 0) { if (ret < 0) - ret = req->io->rw.bytes_done; + ret = io->bytes_done; else - ret += req->io->rw.bytes_done; + ret += io->bytes_done; } if (req->flags & REQ_F_CUR_POS) @@ -2611,18 +2747,12 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; struct io_mapped_ubuf *imu; - u16 index, buf_index; + u16 index, buf_index = req->buf_index; size_t offset; u64 buf_addr; - /* attempt to use fixed buffers without having provided iovecs */ - if (unlikely(!ctx->user_bufs)) - return -EFAULT; - - buf_index = req->buf_index; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; - index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = 
&ctx->user_bufs[index]; buf_addr = req->rw.addr; @@ -2819,22 +2949,15 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) +static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; ssize_t ret; u8 opcode; - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - *iovec = NULL; - return iov_iter_count(&iorw->iter); - } - opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; @@ -2848,10 +2971,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { buf = io_rw_buffer_select(req, &sqe_len, needs_lock); - if (IS_ERR(buf)) { - *iovec = NULL; + if (IS_ERR(buf)) return PTR_ERR(buf); - } req->rw.len = sqe_len; } @@ -2870,13 +2991,25 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return ret; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, - iovec, iter); -#endif + return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, + req->ctx->compat); +} - return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) +{ + struct io_async_rw *iorw = req->async_data; + + if (!iorw) + return __io_import_iovec(rw, req, iovec, iter, needs_lock); + *iovec = NULL; + return iov_iter_count(&iorw->iter); +} + +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? 
NULL : &kiocb->ki_pos; } /* @@ -2914,10 +3047,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } if (iov_iter_is_bvec(iter)) @@ -2940,10 +3073,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter) { - struct io_async_rw *rw = &req->io->rw; + struct io_async_rw *rw = req->async_data; memcpy(&rw->iter, iter, sizeof(*iter)); - rw->free_iovec = NULL; + rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iter->type == ITER_BVEC) @@ -2960,33 +3093,33 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { - rw->free_iovec = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } -static inline int __io_alloc_async_ctx(struct io_kiocb *req) +static inline int __io_alloc_async_data(struct io_kiocb *req) { - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - return req->io == NULL; + WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); + req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + return req->async_data == NULL; } -static int io_alloc_async_ctx(struct io_kiocb *req) +static int io_alloc_async_data(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].async_ctx) + if (!io_op_defs[req->opcode].needs_async_data) return 0; - return __io_alloc_async_ctx(req); + return __io_alloc_async_data(req); } static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force) { - if 
(!force && !io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].needs_async_data) return 0; - if (!req->io) { - if (__io_alloc_async_ctx(req)) + if (!req->async_data) { + if (__io_alloc_async_data(req)) return -ENOMEM; io_req_map_rw(req, iovec, fast_iov, iter); @@ -2994,31 +3127,28 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, return 0; } -static inline int io_rw_prep_async(struct io_kiocb *req, int rw, - bool force_nonblock) +static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { - struct io_async_rw *iorw = &req->io->rw; + struct io_async_rw *iorw = req->async_data; + struct iovec *iov = iorw->fast_iov; ssize_t ret; - iorw->iter.iov = iorw->fast_iov; - /* reset ->io around the iovec import, we don't want to use it */ - req->io = NULL; - ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, - &iorw->iter, !force_nonblock); - req->io = container_of(iorw, struct io_async_ctx, rw); + ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) return ret; - io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); + iorw->bytes_done = 0; + iorw->free_iovec = iov; + if (iov) + req->flags |= REQ_F_NEED_CLEANUP; return 0; } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret; - ret = io_prep_rw(req, sqe, force_nonblock); + ret = io_prep_rw(req, sqe); if (ret) return ret; @@ -3026,9 +3156,9 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EBADF; /* either don't need iovec imported or already have it */ - if (!req->io || req->flags & REQ_F_NEED_CLEANUP) + if (!req->async_data) return 0; - return io_rw_prep_async(req, READ, force_nonblock); + return io_rw_prep_async(req, READ); } /* @@ -3054,6 +3184,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if 
(!wake_page_match(wpq, key)) return 0; + req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); @@ -3061,7 +3192,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -3074,27 +3205,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, return 1; } -static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, - struct wait_page_queue *wait, - wait_queue_func_t func, - void *data) -{ - /* Can't support async wakeup with polled IO */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) { - wait->wait.func = func; - wait->wait.private = data; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_waitq = wait; - return 0; - } - - return -EOPNOTSUPP; -} - /* * This controls whether a given IO request should be armed for async page * based retry. 
If we return false here, the request is handed to the async @@ -3109,16 +3219,18 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, */ static bool io_rw_should_retry(struct io_kiocb *req) { + struct io_async_rw *rw = req->async_data; + struct wait_page_queue *wait = &rw->wpq; struct kiocb *kiocb = &req->rw.kiocb; - int ret; /* never retry for NOWAIT, we just complete with -EAGAIN */ if (req->flags & REQ_F_NOWAIT) return false; /* Only for buffered IO */ - if (kiocb->ki_flags & IOCB_DIRECT) + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; + /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -3126,14 +3238,14 @@ static bool io_rw_should_retry(struct io_kiocb *req) if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, - io_async_buf_func, req); - if (!ret) { - io_get_req_task(req); - return true; - } - - return false; + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_waitq = wait; + return true; } static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) @@ -3152,15 +3264,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; size_t iov_count; + bool no_async; - if (req->io) - iter = &req->io->rw.iter; + if (rw) + iter = &rw->iter; ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); io_size = ret; req->result = io_size; ret = 0; @@ -3168,13 +3283,16 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, /* Ensure we 
clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; + else + kiocb->ki_flags |= IOCB_NOWAIT; + /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, READ)) + no_async = force_nonblock && !io_file_supports_async(req->file, READ); + if (no_async) goto copy_iov; - iov_count = iov_iter_count(iter); - ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3186,14 +3304,19 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = 0; goto out_free; } else if (ret == -EAGAIN) { - if (!force_nonblock) + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); - if (ret) - goto out_free; - return -EAGAIN; + /* no retry on NONBLOCK marked file */ + if (req->file->f_flags & O_NONBLOCK) + goto done; + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + ret = 0; + goto copy_iov; } else if (ret < 0) { - goto out_free; + /* make sure -ERESTARTSYS -> -EINTR is done */ + goto done; } /* read it all, or we did blocking attempt. no retry. 
*/ @@ -3208,12 +3331,15 @@ copy_iov: ret = ret2; goto out_free; } + if (no_async) + return -EAGAIN; + rw = req->async_data; /* it's copied and will be cleaned with ->io */ iovec = NULL; /* now use our persistent iterator, if we aren't already */ - iter = &req->io->rw.iter; + iter = &rw->iter; retry: - req->io->rw.bytes_done += ret; + rw->bytes_done += ret; /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; @@ -3238,17 +3364,17 @@ done: kiocb_done(kiocb, ret, cs); ret = 0; out_free: + /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); return ret; } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret; - ret = io_prep_rw(req, sqe, force_nonblock); + ret = io_prep_rw(req, sqe); if (ret) return ret; @@ -3256,9 +3382,9 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EBADF; /* either don't need iovec imported or already have it */ - if (!req->io || req->flags & REQ_F_NEED_CLEANUP) + if (!req->async_data) return 0; - return io_rw_prep_async(req, WRITE, force_nonblock); + return io_rw_prep_async(req, WRITE); } static int io_write(struct io_kiocb *req, bool force_nonblock, @@ -3267,21 +3393,25 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + struct io_async_rw *rw = req->async_data; size_t iov_count; ssize_t ret, ret2, io_size; - if (req->io) - iter = &req->io->rw.iter; + if (rw) + iter = &rw->iter; ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); io_size = ret; req->result = io_size; /* Ensure we clear previously set non-block flag */ if 
(!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; + else + kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) @@ -3292,8 +3422,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(iter); - ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3325,15 +3454,25 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, */ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; + /* no retry on NONBLOCK marked file */ + if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK)) + goto done; if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + goto copy_iov; +done: kiocb_done(kiocb, ret2, cs); } else { copy_iov: + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; } out_free: + /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); return ret; @@ -3344,10 +3483,7 @@ static int __io_splice_prep(struct io_kiocb *req, { struct io_splice* sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - int ret; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3358,10 +3494,10 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, - (sp->flags & SPLICE_F_FD_IN_FIXED)); - if 
(ret) - return ret; + sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!sp->file_in) + return -EBADF; req->flags |= REQ_F_NEED_CLEANUP; if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { @@ -3529,8 +3665,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) - return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) @@ -3557,8 +3691,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { u64 flags, mode; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; mode = READ_ONCE(sqe->len); flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); @@ -3571,8 +3705,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) @@ -3788,7 +3922,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -3903,7 +4037,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) return -EINVAL; if 
(sqe->ioprio || sqe->buf_index) return -EINVAL; @@ -3959,8 +4093,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if ((req->file && req->file->f_op == &io_uring_fops) || - req->close.fd == req->ctx->ring_fd) + if ((req->file && req->file->f_op == &io_uring_fops)) return -EBADF; req->close.put_file = NULL; @@ -4037,15 +4170,18 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { - if (req->io) + struct io_async_msghdr *async_msg = req->async_data; + + if (async_msg) return -EAGAIN; - if (io_alloc_async_ctx(req)) { + if (io_alloc_async_data(req)) { if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); return -ENOMEM; } + async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + memcpy(async_msg, kmsg, sizeof(*kmsg)); return -EAGAIN; } @@ -4060,8 +4196,8 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - struct io_async_ctx *io = req->io; int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -4076,13 +4212,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - if (!io || req->opcode == IORING_OP_SEND) - return 0; - /* iovec is already imported */ - if (req->flags & REQ_F_NEED_CLEANUP) + if (!async_msg || !io_op_defs[req->opcode].needs_async_data) return 0; - - ret = io_sendmsg_copy_hdr(req, &io->msg); + ret = io_sendmsg_copy_hdr(req, async_msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4100,9 +4232,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - if (req->io) { - kmsg = &req->io->msg; - 
kmsg->msg.msg_name = &req->io->msg.addr; + if (req->async_data) { + kmsg = req->async_data; + kmsg->msg.msg_name = &kmsg->addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -4151,7 +4283,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) - return ret;; + return ret; msg.msg_name = NULL; msg.msg_control = NULL; @@ -4200,8 +4332,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len); iomsg->iov = NULL; } else { - ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->iov, &iomsg->msg.msg_iter); + ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->iov, &iomsg->msg.msg_iter, + false); if (ret > 0) ret = 0; } @@ -4241,9 +4374,9 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->iov[0].iov_len; iomsg->iov = NULL; } else { - ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &iomsg->iov, - &iomsg->msg.msg_iter); + ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + UIO_FASTIOV, &iomsg->iov, + &iomsg->msg.msg_iter, true); if (ret < 0) return ret; } @@ -4289,8 +4422,8 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - struct io_async_ctx *io = req->io; int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -4306,13 +4439,9 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags |= MSG_CMSG_COMPAT; #endif - if (!io || req->opcode == IORING_OP_RECV) + if (!async_msg || !io_op_defs[req->opcode].needs_async_data) return 0; - /* iovec is already imported */ - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; - - ret = io_recvmsg_copy_hdr(req, &io->msg); + ret = io_recvmsg_copy_hdr(req, async_msg); if (!ret) req->flags |= 
REQ_F_NEED_CLEANUP; return ret; @@ -4331,9 +4460,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; + if (req->async_data) { + kmsg = req->async_data; + kmsg->msg.msg_name = &kmsg->addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -4475,7 +4604,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock, static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - struct io_async_ctx *io = req->io; + struct io_async_connect *io = req->async_data; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; @@ -4489,22 +4618,22 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; return move_addr_to_kernel(conn->addr, conn->addr_len, - &io->connect.address); + &io->address); } static int io_connect(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_ctx __io, *io; + struct io_async_connect __io, *io; unsigned file_flags; int ret; - if (req->io) { - io = req->io; + if (req->async_data) { + io = req->async_data; } else { ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, - &__io.connect.address); + &__io.address); if (ret) goto out; io = &__io; @@ -4512,16 +4641,17 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock, file_flags = force_nonblock ? 
O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->connect.address, + ret = __sys_connect_file(req->file, &io->address, req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req->io) + if (req->async_data) return -EAGAIN; - if (io_alloc_async_ctx(req)) { + if (io_alloc_async_data(req)) { ret = -ENOMEM; goto out; } - memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); + io = req->async_data; + memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -4600,6 +4730,7 @@ struct io_poll_table { static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { + bool twa_signal_ok; int ret; /* for instances that support it check for an event match first: */ @@ -4615,12 +4746,20 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, percpu_ref_get(&req->ctx->refs); /* + * If we using the signalfd wait_queue_head for this wakeup, then + * it's not safe to use TWA_SIGNAL as we could be recursing on the + * tsk->sighand->siglock on doing the wakeup. Should not be needed + * either, as the normal wakeup will suffice. + */ + twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh); + + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. 
*/ - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, twa_signal_ok); if (unlikely(ret)) { struct task_struct *tsk; @@ -4654,9 +4793,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - /* pure poll stashes this in ->io, poll driven retry elsewhere */ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) - return (struct io_poll_iocb *) req->io; + return req->async_data; return req->apoll->double_poll; } @@ -4736,6 +4875,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (mask && !(mask & poll->events)) return 0; + list_del_init(&wait->entry); + if (poll && poll->head) { bool done; @@ -4909,12 +5050,20 @@ static bool io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask, ret; + int rw; if (!req->file || !file_can_poll(req->file)) return false; if (req->flags & REQ_F_POLLED) return false; - if (!def->pollin && !def->pollout) + if (def->pollin) + rw = READ; + else if (def->pollout) + rw = WRITE; + else + return false; + /* if we can't nonblock try, then no point in arming a poll handler */ + if (!io_file_supports_async(req->file, rw)) return false; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -4923,7 +5072,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4998,7 +5146,10 @@ static bool io_poll_remove_one(struct io_kiocb *req) return do_complete; } -static void io_poll_remove_all(struct io_ring_ctx *ctx) +/* + * Returns true if we found and killed one or more poll requests + */ +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5009,13 +5160,17 @@ static void 
io_poll_remove_all(struct io_ring_ctx *ctx) struct hlist_head *list; list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) - posted += io_poll_remove_one(req); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (io_task_match(req, tsk)) + posted += io_poll_remove_one(req); + } } spin_unlock_irq(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); + + return posted != 0; } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) @@ -5083,7 +5238,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io); + __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5104,8 +5259,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | (events & EPOLLEXCLUSIVE); - - io_get_req_task(req); return 0; } @@ -5144,16 +5297,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); + list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - /* - * We could be racing with timeout deletion. If the list is empty, - * then timeout lookup already found it and will be handling it. 
- */ - if (!list_empty(&req->timeout.list)) - list_del_init(&req->timeout.list); - io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -5166,13 +5313,13 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static int __io_timeout_cancel(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; int ret; - list_del_init(&req->timeout.list); - - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret == -1) return -EALREADY; + list_del_init(&req->timeout.list); req_set_fail_links(req); req->flags |= REQ_F_COMP_LOCKED; @@ -5206,14 +5353,10 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - - req->timeout.addr = READ_ONCE(sqe->addr); - req->timeout.flags = READ_ONCE(sqe->timeout_flags); - if (req->timeout.flags) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) return -EINVAL; + req->timeout_rem.addr = READ_ONCE(sqe->addr); return 0; } @@ -5226,7 +5369,7 @@ static int io_timeout_remove(struct io_kiocb *req) int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, req->timeout.addr); + ret = io_timeout_cancel(ctx, req->timeout_rem.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); @@ -5257,10 +5400,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->timeout.off = off; - if (!req->io && io_alloc_async_ctx(req)) + if (!req->async_data && io_alloc_async_data(req)) return -ENOMEM; - data = &req->io->timeout; + data = req->async_data; data->req = req; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -5278,7 +5421,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_timeout(struct io_kiocb *req) { struct 
io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = &req->io->timeout; + struct io_timeout_data *data = req->async_data; struct list_head *entry; u32 tail, off = req->timeout.off; @@ -5403,6 +5546,8 @@ static int io_async_cancel(struct io_kiocb *req) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL)) + return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->ioprio || sqe->rw_flags) @@ -5439,118 +5584,86 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, return 0; } -static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret = 0; - - if (!sqe) - return 0; - - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_prep_work_files(req); - if (unlikely(ret)) - return ret; - switch (req->opcode) { case IORING_OP_NOP: - break; + return 0; case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - ret = io_read_prep(req, sqe, true); - break; + return io_read_prep(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - ret = io_write_prep(req, sqe, true); - break; + return io_write_prep(req, sqe); case IORING_OP_POLL_ADD: - ret = io_poll_add_prep(req, sqe); - break; + return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: - ret = io_poll_remove_prep(req, sqe); - break; + return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: - ret = io_prep_fsync(req, sqe); - break; + return io_prep_fsync(req, sqe); case IORING_OP_SYNC_FILE_RANGE: - ret = io_prep_sfr(req, sqe); - break; + return io_prep_sfr(req, sqe); case IORING_OP_SENDMSG: case IORING_OP_SEND: - ret = io_sendmsg_prep(req, sqe); - break; + return io_sendmsg_prep(req, sqe); case IORING_OP_RECVMSG: case IORING_OP_RECV: - ret = io_recvmsg_prep(req, sqe); - break; + return 
io_recvmsg_prep(req, sqe); case IORING_OP_CONNECT: - ret = io_connect_prep(req, sqe); - break; + return io_connect_prep(req, sqe); case IORING_OP_TIMEOUT: - ret = io_timeout_prep(req, sqe, false); - break; + return io_timeout_prep(req, sqe, false); case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove_prep(req, sqe); - break; + return io_timeout_remove_prep(req, sqe); case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel_prep(req, sqe); - break; + return io_async_cancel_prep(req, sqe); case IORING_OP_LINK_TIMEOUT: - ret = io_timeout_prep(req, sqe, true); - break; + return io_timeout_prep(req, sqe, true); case IORING_OP_ACCEPT: - ret = io_accept_prep(req, sqe); - break; + return io_accept_prep(req, sqe); case IORING_OP_FALLOCATE: - ret = io_fallocate_prep(req, sqe); - break; + return io_fallocate_prep(req, sqe); case IORING_OP_OPENAT: - ret = io_openat_prep(req, sqe); - break; + return io_openat_prep(req, sqe); case IORING_OP_CLOSE: - ret = io_close_prep(req, sqe); - break; + return io_close_prep(req, sqe); case IORING_OP_FILES_UPDATE: - ret = io_files_update_prep(req, sqe); - break; + return io_files_update_prep(req, sqe); case IORING_OP_STATX: - ret = io_statx_prep(req, sqe); - break; + return io_statx_prep(req, sqe); case IORING_OP_FADVISE: - ret = io_fadvise_prep(req, sqe); - break; + return io_fadvise_prep(req, sqe); case IORING_OP_MADVISE: - ret = io_madvise_prep(req, sqe); - break; + return io_madvise_prep(req, sqe); case IORING_OP_OPENAT2: - ret = io_openat2_prep(req, sqe); - break; + return io_openat2_prep(req, sqe); case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl_prep(req, sqe); - break; + return io_epoll_ctl_prep(req, sqe); case IORING_OP_SPLICE: - ret = io_splice_prep(req, sqe); - break; + return io_splice_prep(req, sqe); case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers_prep(req, sqe); - break; + return io_provide_buffers_prep(req, sqe); case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers_prep(req, sqe); - break; + return 
io_remove_buffers_prep(req, sqe); case IORING_OP_TEE: - ret = io_tee_prep(req, sqe); - break; - default: - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - ret = -EINVAL; - break; + return io_tee_prep(req, sqe); } - return ret; + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + return-EINVAL; +} + +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (!sqe) + return 0; + if (io_alloc_async_data(req)) + return -EAGAIN; + return io_req_prep(req, sqe); } static u32 io_get_sequence(struct io_kiocb *req) @@ -5584,7 +5697,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->io) { + if (!req->async_data) { ret = io_req_defer_prep(req, sqe); if (ret) return ret; @@ -5610,10 +5723,24 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void __io_clean_op(struct io_kiocb *req) +static void io_req_drop_files(struct io_kiocb *req) { - struct io_async_ctx *io = req->io; + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.files); + put_nsproxy(req->work.nsproxy); + req->work.files = NULL; +} + +static void __io_clean_op(struct io_kiocb *req) +{ if (req->flags & REQ_F_BUFFER_SELECTED) { switch (req->opcode) { case IORING_OP_READV: @@ -5636,27 +5763,39 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_READ: case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.free_iovec) - kfree(io->rw.free_iovec); + case IORING_OP_WRITE: { + struct io_async_rw *io = req->async_data; + if 
(io->free_iovec) + kfree(io->free_iovec); break; + } case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); + case IORING_OP_SENDMSG: { + struct io_async_msghdr *io = req->async_data; + if (io->iov != io->fast_iov) + kfree(io->iov); break; + } case IORING_OP_SPLICE: case IORING_OP_TEE: io_put_file(req, req->splice.file_in, (req->splice.flags & SPLICE_F_FD_IN_FIXED)); break; + case IORING_OP_OPENAT: + case IORING_OP_OPENAT2: + if (req->open.filename) + putname(req->open.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } + + if (req->flags & REQ_F_INFLIGHT) + io_req_drop_files(req); } -static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock, struct io_comp_state *cs) +static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5668,221 +5807,89 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - if (sqe) { - ret = io_read_prep(req, sqe, force_nonblock); - if (ret < 0) - break; - } ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - if (sqe) { - ret = io_write_prep(req, sqe, force_nonblock); - if (ret < 0) - break; - } ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: - if (sqe) { - ret = io_prep_fsync(req, sqe); - if (ret < 0) - break; - } ret = io_fsync(req, force_nonblock); break; case IORING_OP_POLL_ADD: - if (sqe) { - ret = io_poll_add_prep(req, sqe); - if (ret) - break; - } ret = io_poll_add(req); break; case IORING_OP_POLL_REMOVE: - if (sqe) { - ret = io_poll_remove_prep(req, sqe); - if (ret < 0) - break; - } ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - if (sqe) { - ret = io_prep_sfr(req, sqe); - if (ret < 0) - break; - } ret = io_sync_file_range(req, 
force_nonblock); break; case IORING_OP_SENDMSG: + ret = io_sendmsg(req, force_nonblock, cs); + break; case IORING_OP_SEND: - if (sqe) { - ret = io_sendmsg_prep(req, sqe); - if (ret < 0) - break; - } - if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock, cs); - else - ret = io_send(req, force_nonblock, cs); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: + ret = io_recvmsg(req, force_nonblock, cs); + break; case IORING_OP_RECV: - if (sqe) { - ret = io_recvmsg_prep(req, sqe); - if (ret) - break; - } - if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock, cs); - else - ret = io_recv(req, force_nonblock, cs); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: - if (sqe) { - ret = io_timeout_prep(req, sqe, false); - if (ret) - break; - } ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - if (sqe) { - ret = io_timeout_remove_prep(req, sqe); - if (ret) - break; - } ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - if (sqe) { - ret = io_accept_prep(req, sqe); - if (ret) - break; - } ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: - if (sqe) { - ret = io_connect_prep(req, sqe); - if (ret) - break; - } ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: - if (sqe) { - ret = io_async_cancel_prep(req, sqe); - if (ret) - break; - } ret = io_async_cancel(req); break; case IORING_OP_FALLOCATE: - if (sqe) { - ret = io_fallocate_prep(req, sqe); - if (ret) - break; - } ret = io_fallocate(req, force_nonblock); break; case IORING_OP_OPENAT: - if (sqe) { - ret = io_openat_prep(req, sqe); - if (ret) - break; - } ret = io_openat(req, force_nonblock); break; case IORING_OP_CLOSE: - if (sqe) { - ret = io_close_prep(req, sqe); - if (ret) - break; - } ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: - if (sqe) { - ret = io_files_update_prep(req, sqe); - if (ret) - break; - } ret = 
io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: - if (sqe) { - ret = io_statx_prep(req, sqe); - if (ret) - break; - } ret = io_statx(req, force_nonblock); break; case IORING_OP_FADVISE: - if (sqe) { - ret = io_fadvise_prep(req, sqe); - if (ret) - break; - } ret = io_fadvise(req, force_nonblock); break; case IORING_OP_MADVISE: - if (sqe) { - ret = io_madvise_prep(req, sqe); - if (ret) - break; - } ret = io_madvise(req, force_nonblock); break; case IORING_OP_OPENAT2: - if (sqe) { - ret = io_openat2_prep(req, sqe); - if (ret) - break; - } ret = io_openat2(req, force_nonblock); break; case IORING_OP_EPOLL_CTL: - if (sqe) { - ret = io_epoll_ctl_prep(req, sqe); - if (ret) - break; - } ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: - if (sqe) { - ret = io_splice_prep(req, sqe); - if (ret < 0) - break; - } ret = io_splice(req, force_nonblock); break; case IORING_OP_PROVIDE_BUFFERS: - if (sqe) { - ret = io_provide_buffers_prep(req, sqe); - if (ret) - break; - } ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: - if (sqe) { - ret = io_remove_buffers_prep(req, sqe); - if (ret) - break; - } ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: - if (sqe) { - ret = io_tee_prep(req, sqe); - if (ret < 0) - break; - } ret = io_tee(req, force_nonblock); break; default: @@ -5928,7 +5935,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false, NULL); + ret = io_issue_sqe(req, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5957,20 +5964,19 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file **out_file, bool fixed) +static struct file 
*io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed) { struct io_ring_ctx *ctx = req->ctx; struct file *file; if (fixed) { - if (unlikely(!ctx->file_data || - (unsigned) fd >= ctx->nr_user_files)) - return -EBADF; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); if (file) { - req->fixed_file_refs = ctx->file_data->cur_refs; + req->fixed_file_refs = &ctx->file_data->node->refs; percpu_ref_get(req->fixed_file_refs); } } else { @@ -5978,11 +5984,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, file = __io_file_get(state, fd); } - if (file || io_op_defs[req->opcode].needs_file_no_error) { - *out_file = file; - return 0; - } - return -EBADF; + return file; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -5994,46 +5996,10 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (unlikely(!fixed && io_async_submit(req->ctx))) return -EBADF; - return io_file_get(state, req, fd, &req->file, fixed); -} - -static int io_grab_files(struct io_kiocb *req) -{ - int ret = -EBADF; - struct io_ring_ctx *ctx = req->ctx; - - io_req_init_async(req); - - if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) - return 0; - if (!ctx->ring_file) - return -EBADF; - - rcu_read_lock(); - spin_lock_irq(&ctx->inflight_lock); - /* - * We use the f_ops->flush() handler to ensure that we can flush - * out work accessing these files if the fd is closed. Check if - * the fd has changed since we started down this path, and disallow - * this operation if it has. 
- */ - if (fcheck(ctx->ring_fd) == ctx->ring_file) { - list_add(&req->inflight_entry, &ctx->inflight_list); - req->flags |= REQ_F_INFLIGHT; - req->work.files = current->files; - ret = 0; - } - spin_unlock_irq(&ctx->inflight_lock); - rcu_read_unlock(); - - return ret; -} - -static inline int io_prep_work_files(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].file_table) + req->file = io_file_get(state, req, fd, fixed); + if (req->file || io_op_defs[req->opcode].needs_file_no_error) return 0; - return io_grab_files(req); + return -EBADF; } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) @@ -6080,7 +6046,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) * we got a chance to setup the timer */ if (!list_empty(&req->link_list)) { - struct io_timeout_data *data = &req->io->timeout; + struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), @@ -6118,8 +6084,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_comp_state *cs) +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -6139,7 +6104,7 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true, cs); + ret = io_issue_sqe(req, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -6148,9 +6113,6 @@ again: if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (!io_arm_poll_handler(req)) { punt: - ret = io_prep_work_files(req); - if (unlikely(ret)) - goto err; /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. 
@@ -6164,7 +6126,6 @@ punt: } if (unlikely(ret)) { -err: /* un-prep timeout, so it'll be killed as any other linked */ req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); @@ -6204,7 +6165,7 @@ fail_req: io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - if (!req->io) { + if (!req->async_data) { ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) goto fail_req; @@ -6218,7 +6179,12 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe, cs); + if (sqe) { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + } + __io_queue_sqe(req, cs); } } @@ -6266,7 +6232,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } trace_io_uring_link(ctx, req, head); - io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ @@ -6315,9 +6280,6 @@ static void io_submit_state_start(struct io_submit_state *state, struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); -#ifdef CONFIG_BLOCK - state->plug.nowait = true; -#endif state->comp.nr = 0; INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; @@ -6374,6 +6336,32 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx) ctx->cached_sq_head++; } +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. 
+ */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) +{ + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) @@ -6383,11 +6371,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, struct io_submit_state *state) { unsigned int sqe_flags; - int id; + int id, ret; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); - req->io = NULL; + req->async_data = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -6407,6 +6395,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; @@ -6426,11 +6417,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (!io_op_defs[req->opcode].needs_file) return 0; - return io_req_set_file(state, req, READ_ONCE(sqe->fd)); + ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); + state->ios_left--; + return ret; } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct file *ring_file, int ring_fd) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; struct io_kiocb *link = NULL; @@ -6439,7 +6431,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, /* if we have a backlog and couldn't flush it all, return BUSY 
*/ if (test_bit(0, &ctx->sq_check_overflow)) { if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) + !io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; } @@ -6449,10 +6441,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - io_submit_state_start(&state, ctx, nr); + atomic_long_add(nr, &current->io_uring->req_issue); + refcount_add(nr, &current->usage); - ctx->ring_fd = ring_fd; - ctx->ring_file = ring_file; + io_submit_state_start(&state, ctx, nr); for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; @@ -6470,12 +6462,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - - err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; + err = io_init_req(ctx, req, sqe, &state); if (unlikely(err)) { fail_req: io_put_req(req); @@ -6494,6 +6485,8 @@ fail_req: int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; percpu_ref_put_many(&ctx->refs, nr - ref_used); + atomic_long_sub(nr - ref_used, &current->io_uring->req_issue); + put_task_struct_many(current, nr - ref_used); } if (link) io_queue_link_head(link, &state.comp); @@ -6520,117 +6513,186 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static int io_sq_thread(void *data) +static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, + int sync, void *key) { - struct io_ring_ctx *ctx = data; - const struct cred *old_cred; - DEFINE_WAIT(wait); - unsigned long timeout; + struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); + int ret; + + ret = autoremove_wake_function(wqe, mode, sync, key); + if (ret) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } + return ret; +} + +enum sq_ret { + SQT_IDLE = 1, + SQT_SPIN = 2, + SQT_DID_WORK = 4, +}; + +static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, + unsigned long start_jiffies, bool cap_entries) +{ + unsigned long timeout = start_jiffies + ctx->sq_thread_idle; + struct io_sq_data *sqd = ctx->sq_data; + unsigned int to_submit; int ret = 0; - complete(&ctx->sq_thread_comp); +again: + if (!list_empty(&ctx->iopoll_list)) { + unsigned nr_events = 0; - old_cred = override_creds(ctx->creds); + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->iopoll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); + mutex_unlock(&ctx->uring_lock); + } + + to_submit = io_sqring_entries(ctx); + + /* + * If submit got -EBUSY, flag us as needing the application + * to enter the kernel to reap and flush events. + */ + if (!to_submit || ret == -EBUSY || need_resched()) { + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). 
Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + io_sq_thread_drop_mm(); - timeout = jiffies + ctx->sq_thread_idle; - while (!kthread_should_park()) { - unsigned int to_submit; + /* + * We're polling. If we're within the defined idle + * period, then let us spin without work before going + * to sleep. The exception is if we got EBUSY doing + * more IO, we should wait for the application to + * reap events and wake us up. + */ + if (!list_empty(&ctx->iopoll_list) || need_resched() || + (!time_after(jiffies, timeout) && ret != -EBUSY && + !percpu_ref_is_dying(&ctx->refs))) + return SQT_SPIN; - if (!list_empty(&ctx->iopoll_list)) { - unsigned nr_events = 0; + prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, + TASK_INTERRUPTIBLE); - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list) && !need_resched()) - io_do_iopoll(ctx, &nr_events, 0); - else - timeout = jiffies + ctx->sq_thread_idle; - mutex_unlock(&ctx->uring_lock); + /* + * While doing polled IO, before going to sleep, we need + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. 
+ */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + goto again; } to_submit = io_sqring_entries(ctx); + if (!to_submit || ret == -EBUSY) + return SQT_IDLE; + } + + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + io_ring_clear_wakeup_flag(ctx); + + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > 8) + to_submit = 8; + + mutex_lock(&ctx->uring_lock); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + + return SQT_DID_WORK; +} + +static void io_sqd_init_new(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + + while (!list_empty(&sqd->ctx_new_list)) { + ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); + init_wait(&ctx->sqo_wait_entry); + ctx->sqo_wait_entry.func = io_sq_wake_function; + list_move_tail(&ctx->sqd_list, &sqd->ctx_list); + complete(&ctx->sq_thread_comp); + } +} + +static int io_sq_thread(void *data) +{ + struct cgroup_subsys_state *cur_css = NULL; + const struct cred *old_cred = NULL; + struct io_sq_data *sqd = data; + struct io_ring_ctx *ctx; + unsigned long start_jiffies; + + start_jiffies = jiffies; + while (!kthread_should_stop()) { + enum sq_ret ret = 0; + bool cap_entries; /* - * If submit got -EBUSY, flag us as needing the application - * to enter the kernel to reap and flush events. + * Any changes to the sqd lists are synchronized through the + * kthread parking. This synchronizes the thread vs users, + * the users are synchronized on the sqd->ctx_lock. */ - if (!to_submit || ret == -EBUSY || need_resched()) { - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. 
- */ - io_sq_thread_drop_mm(); + if (kthread_should_park()) + kthread_parkme(); - /* - * We're polling. If we're within the defined idle - * period, then let us spin without work before going - * to sleep. The exception is if we got EBUSY doing - * more IO, we should wait for the application to - * reap events and wake us up. - */ - if (!list_empty(&ctx->iopoll_list) || need_resched() || - (!time_after(jiffies, timeout) && ret != -EBUSY && - !percpu_ref_is_dying(&ctx->refs))) { - io_run_task_work(); - cond_resched(); - continue; - } + if (unlikely(!list_empty(&sqd->ctx_new_list))) + io_sqd_init_new(sqd); - prepare_to_wait(&ctx->sqo_wait, &wait, - TASK_INTERRUPTIBLE); + cap_entries = !list_is_singular(&sqd->ctx_list); - /* - * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to iopoll_list, - * it is because reqs may have been punted to io worker - * and will be added to iopoll_list later, hence check - * the iopoll_list again. - */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - finish_wait(&ctx->sqo_wait, &wait); - continue; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (current->cred != ctx->creds) { + if (old_cred) + revert_creds(old_cred); + old_cred = override_creds(ctx->creds); } + io_sq_thread_associate_blkcg(ctx, &cur_css); - io_ring_set_wakeup_flag(ctx); + ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); - to_submit = io_sqring_entries(ctx); - if (!to_submit || ret == -EBUSY) { - if (kthread_should_park()) { - finish_wait(&ctx->sqo_wait, &wait); - break; - } - if (io_run_task_work()) { - finish_wait(&ctx->sqo_wait, &wait); - io_ring_clear_wakeup_flag(ctx); - continue; - } - if (signal_pending(current)) - flush_signals(current); - schedule(); - finish_wait(&ctx->sqo_wait, &wait); + io_sq_thread_drop_mm(); + } - io_ring_clear_wakeup_flag(ctx); - ret = 0; + if (ret & SQT_SPIN) { + io_run_task_work(); + cond_resched(); + } else if (ret == SQT_IDLE) { + 
if (kthread_should_park()) continue; - } - finish_wait(&ctx->sqo_wait, &wait); - - io_ring_clear_wakeup_flag(ctx); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_set_wakeup_flag(ctx); + schedule(); + start_jiffies = jiffies; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_clear_wakeup_flag(ctx); } - - mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit, NULL, -1); - mutex_unlock(&ctx->uring_lock); - timeout = jiffies + ctx->sq_thread_idle; } io_run_task_work(); - io_sq_thread_drop_mm(); - revert_creds(old_cred); + if (cur_css) + io_sq_thread_unassociate_blkcg(); + if (old_cred) + revert_creds(old_cred); kthread_parkme(); @@ -6670,6 +6732,22 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return autoremove_wake_function(curr, mode, wake_flags, key); } +static int io_run_task_work_sig(void) +{ + if (io_run_task_work()) + return 1; + if (!signal_pending(current)) + return 0; + if (current->jobctl & JOBCTL_TASK_WORK) { + spin_lock_irq(&current->sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + return 1; + } + return -EINTR; +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. 
@@ -6715,19 +6793,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (io_run_task_work()) + ret = io_run_task_work_sig(); + if (ret > 0) continue; - if (signal_pending(current)) { - if (current->jobctl & JOBCTL_TASK_WORK) { - spin_lock_irq(&current->sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - continue; - } - ret = -EINTR; + else if (ret < 0) break; - } if (io_should_wake(&iowq, false)) break; schedule(); @@ -6805,18 +6875,116 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } -static void io_sq_thread_stop(struct io_ring_ctx *ctx) +static void io_put_sq_data(struct io_sq_data *sqd) { - if (ctx->sqo_thread) { - wait_for_completion(&ctx->sq_thread_comp); + if (refcount_dec_and_test(&sqd->refs)) { /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity * set to a single CPU. 
*/ - kthread_park(ctx->sqo_thread); - kthread_stop(ctx->sqo_thread); - ctx->sqo_thread = NULL; + if (sqd->thread) { + kthread_park(sqd->thread); + kthread_stop(sqd->thread); + } + + kfree(sqd); + } +} + +static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx_attach; + struct io_sq_data *sqd; + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return ERR_PTR(-ENXIO); + if (f.file->f_op != &io_uring_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + ctx_attach = f.file->private_data; + sqd = ctx_attach->sq_data; + if (!sqd) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + refcount_inc(&sqd->refs); + fdput(f); + return sqd; +} + +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) +{ + struct io_sq_data *sqd; + + if (p->flags & IORING_SETUP_ATTACH_WQ) + return io_attach_sq_data(p); + + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + if (!sqd) + return ERR_PTR(-ENOMEM); + + refcount_set(&sqd->refs, 1); + INIT_LIST_HEAD(&sqd->ctx_list); + INIT_LIST_HEAD(&sqd->ctx_new_list); + mutex_init(&sqd->ctx_lock); + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + return sqd; +} + +static void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + if (!sqd->thread) + return; + kthread_unpark(sqd->thread); + mutex_unlock(&sqd->lock); +} + +static void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + if (!sqd->thread) + return; + mutex_lock(&sqd->lock); + kthread_park(sqd->thread); +} + +static void io_sq_thread_stop(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + if (sqd->thread) { + /* + * We may arrive here from the error branch in + * io_sq_offload_create() where the kthread is created + * without being waked up, thus wake it up now to make + * sure the wait will complete. 
+ */ + wake_up_process(sqd->thread); + wait_for_completion(&ctx->sq_thread_comp); + + io_sq_thread_park(sqd); + } + + mutex_lock(&sqd->ctx_lock); + list_del(&ctx->sqd_list); + mutex_unlock(&sqd->ctx_lock); + + if (sqd->thread) { + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + io_sq_thread_unpark(sqd); + } + + io_put_sq_data(sqd); + ctx->sq_data = NULL; } } @@ -6927,13 +7095,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, - unsigned nr_files) +static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data, + unsigned nr_tables, unsigned nr_files) { int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_data->table[i]; + struct fixed_file_table *table = &file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -6948,7 +7116,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_data->table[i]; + struct fixed_file_table *table = &file_data->table[i]; kfree(table->files); } return 1; @@ -7110,11 +7278,11 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; - unsigned nr_tables; + unsigned nr_tables, i; struct file *file; - int fd, ret = 0; - unsigned i; + int fd, ret = -ENOMEM; struct fixed_file_ref_node *ref_node; + struct fixed_file_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7123,60 +7291,43 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); - if (!ctx->file_data) + file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!file_data) return -ENOMEM; - ctx->file_data->ctx = ctx; - 
init_completion(&ctx->file_data->done); - INIT_LIST_HEAD(&ctx->file_data->ref_list); - spin_lock_init(&ctx->file_data->lock); + file_data->ctx = ctx; + init_completion(&file_data->done); + INIT_LIST_HEAD(&file_data->ref_list); + spin_lock_init(&file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_data->table = kcalloc(nr_tables, - sizeof(struct fixed_file_table), - GFP_KERNEL); - if (!ctx->file_data->table) { - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + file_data->table = kcalloc(nr_tables, sizeof(file_data->table), + GFP_KERNEL); + if (!file_data->table) + goto out_free; - if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + if (percpu_ref_init(&file_data->refs, io_file_ref_kill, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + goto out_free; - if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - percpu_ref_exit(&ctx->file_data->refs); - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) + goto out_ref; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct fixed_file_table *table; unsigned index; - ret = -EFAULT; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) - break; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + ret = -EFAULT; + goto out_fput; + } /* allow sparse sets */ - if (fd == -1) { - ret = 0; + if (fd == -1) continue; - } - table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - index = i & IORING_FILE_TABLE_MASK; file = fget(fd); - ret = -EBADF; if (!file) - break; + goto out_fput; /* * Don't allow io_uring instances to be registered. 
If UNIX @@ -7187,29 +7338,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - break; + goto out_fput; } - ret = 0; + table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; table->files[index] = file; } - if (ret) { - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } - for (i = 0; i < nr_tables; i++) - kfree(ctx->file_data->table[i].files); - - percpu_ref_exit(&ctx->file_data->refs); - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; - return ret; - } - + ctx->file_data = file_data; ret = io_sqe_files_scm(ctx); if (ret) { io_sqe_files_unregister(ctx); @@ -7222,11 +7358,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return PTR_ERR(ref_node); } - ctx->file_data->cur_refs = &ref_node->refs; - spin_lock(&ctx->file_data->lock); - list_add(&ref_node->node, &ctx->file_data->ref_list); - spin_unlock(&ctx->file_data->lock); - percpu_ref_get(&ctx->file_data->refs); + file_data->node = ref_node; + spin_lock(&file_data->lock); + list_add(&ref_node->node, &file_data->ref_list); + spin_unlock(&file_data->lock); + percpu_ref_get(&file_data->refs); + return ret; +out_fput: + for (i = 0; i < ctx->nr_user_files; i++) { + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } + for (i = 0; i < nr_tables; i++) + kfree(file_data->table[i].files); + ctx->nr_user_files = 0; +out_ref: + percpu_ref_exit(&file_data->refs); +out_free: + kfree(file_data->table); + kfree(file_data); return ret; } @@ -7277,14 +7428,12 @@ static int io_queue_file_removal(struct fixed_file_data *data, struct file *file) { struct io_file_put *pfile; - struct percpu_ref *refs = data->cur_refs; - struct fixed_file_ref_node *ref_node; + struct fixed_file_ref_node *ref_node = data->node; pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); if 
(!pfile) return -ENOMEM; - ref_node = container_of(refs, struct fixed_file_ref_node, refs); pfile->file = file; list_add(&pfile->list, &ref_node->file_list); @@ -7327,7 +7476,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - file = io_file_from_index(ctx, index); + file = table->files[index]; err = io_queue_file_removal(data, file); if (err) break; @@ -7356,6 +7505,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, table->files[index] = file; err = io_sqe_file_register(ctx, file, i); if (err) { + table->files[index] = NULL; fput(file); break; } @@ -7366,10 +7516,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } if (needs_switch) { - percpu_ref_kill(data->cur_refs); + percpu_ref_kill(&data->node->refs); spin_lock(&data->lock); list_add(&ref_node->node, &data->ref_list); - data->cur_refs = &ref_node->refs; + data->node = ref_node; spin_unlock(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else @@ -7450,23 +7600,65 @@ out_fput: return ret; } -static int io_sq_offload_start(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static int io_uring_alloc_task_context(struct task_struct *task) { - int ret; + struct io_uring_task *tctx; - mmgrab(current->mm); - ctx->sqo_mm = current->mm; + tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); + if (unlikely(!tctx)) + return -ENOMEM; + + xa_init(&tctx->xa); + init_waitqueue_head(&tctx->wait); + tctx->last = NULL; + tctx->in_idle = 0; + atomic_long_set(&tctx->req_issue, 0); + atomic_long_set(&tctx->req_complete, 0); + task->io_uring = tctx; + return 0; +} + +void __io_uring_free(struct task_struct *tsk) +{ + struct io_uring_task *tctx = tsk->io_uring; + + WARN_ON_ONCE(!xa_empty(&tctx->xa)); + kfree(tctx); + tsk->io_uring = NULL; +} + +static int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + int ret; if (ctx->flags & 
IORING_SETUP_SQPOLL) { + struct io_sq_data *sqd; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; + sqd = io_get_sq_data(p); + if (IS_ERR(sqd)) { + ret = PTR_ERR(sqd); + goto err; + } + + ctx->sq_data = sqd; + io_sq_thread_park(sqd); + mutex_lock(&sqd->ctx_lock); + list_add(&ctx->sqd_list, &sqd->ctx_new_list); + mutex_unlock(&sqd->ctx_lock); + io_sq_thread_unpark(sqd); + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; + if (sqd->thread) + goto done; + if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; @@ -7476,25 +7668,27 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, if (!cpu_online(cpu)) goto err; - ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, - ctx, cpu, - "io_uring-sq"); + sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd, + cpu, "io_uring-sq"); } else { - ctx->sqo_thread = kthread_create(io_sq_thread, ctx, + sqd->thread = kthread_create(io_sq_thread, sqd, "io_uring-sq"); } - if (IS_ERR(ctx->sqo_thread)) { - ret = PTR_ERR(ctx->sqo_thread); - ctx->sqo_thread = NULL; + if (IS_ERR(sqd->thread)) { + ret = PTR_ERR(sqd->thread); + sqd->thread = NULL; goto err; } - wake_up_process(ctx->sqo_thread); + ret = io_uring_alloc_task_context(sqd->thread); + if (ret) + goto err; } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; goto err; } +done: ret = io_init_wq_offload(ctx, p); if (ret) goto err; @@ -7502,13 +7696,17 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - if (ctx->sqo_mm) { - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; - } return ret; } +static void io_sq_offload_start(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread) + wake_up_process(sqd->thread); +} + static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7540,11 +7738,11 @@ static void 
io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->sqo_mm) { + if (ctx->mm_account) { if (acct == ACCT_LOCKED) - ctx->sqo_mm->locked_vm -= nr_pages; + ctx->mm_account->locked_vm -= nr_pages; else if (acct == ACCT_PINNED) - atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } } @@ -7559,11 +7757,11 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, return ret; } - if (ctx->sqo_mm) { + if (ctx->mm_account) { if (acct == ACCT_LOCKED) - ctx->sqo_mm->locked_vm += nr_pages; + ctx->mm_account->locked_vm += nr_pages; else if (acct == ACCT_PINNED) - atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); } return 0; @@ -7643,7 +7841,8 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7679,11 +7878,80 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, return 0; } +/* + * Not super efficient, but this is just a registration time. And we do cache + * the last compound head, so generally we'll only do a full search if we don't + * match that one. + * + * We check if the given compound head page has already been accounted, to + * avoid double accounting it. This allows us to account the full size of the + * page, not just the constituent pages of a huge page. 
+ */ +static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct page *hpage) +{ + int i, j; + + /* check current page array */ + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) + continue; + if (compound_head(pages[i]) == hpage) + return true; + } + + /* check previously registered pages */ + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) { + if (!PageCompound(imu->bvec[j].bv_page)) + continue; + if (compound_head(imu->bvec[j].bv_page) == hpage) + return true; + } + } + + return false; +} + +static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct io_mapped_ubuf *imu, + struct page **last_hpage) +{ + int i, ret; + + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) { + imu->acct_pages++; + } else { + struct page *hpage; + + hpage = compound_head(pages[i]); + if (hpage == *last_hpage) + continue; + *last_hpage = hpage; + if (headpage_already_acct(ctx, pages, i, hpage)) + continue; + imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; + } + } + + if (!imu->acct_pages) + return 0; + + ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED); + if (ret) + imu->acct_pages = 0; + return ret; +} + static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct vm_area_struct **vmas = NULL; struct page **pages = NULL; + struct page *last_hpage = NULL; int i, j, got_pages = 0; int ret = -EINVAL; @@ -7726,10 +7994,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); - if (ret) - goto err; - ret = 0; if (!pages || nr_pages > got_pages) { kvfree(vmas); @@ -7741,7 +8005,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - 
io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7750,10 +8013,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; - if (!imu->bvec) { - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); + if (!imu->bvec) goto err; - } ret = 0; mmap_read_lock(current->mm); @@ -7782,7 +8043,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); + kvfree(imu->bvec); + goto err; + } + + ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage); + if (ret) { + unpin_user_pages(pages, pret); kvfree(imu->bvec); goto err; } @@ -7867,11 +8134,19 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); io_sqe_buffer_unregister(ctx); - if (ctx->sqo_mm) { - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + + if (ctx->sqo_task) { + put_task_struct(ctx->sqo_task); + ctx->sqo_task = NULL; + mmdrop(ctx->mm_account); + ctx->mm_account = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (ctx->sqo_blkcg_css) + css_put(ctx->sqo_blkcg_css); +#endif + io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); @@ -7906,8 +8181,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != - ctx->rings->sq_ring_entries) + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; if (io_cqring_events(ctx, false)) mask |= EPOLLIN | EPOLLRDNORM; @@ -7946,7 +8220,7 @@ static void io_ring_exit_work(struct work_struct *work) */ do { if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); @@ -7958,15 +8232,15 @@ static void 
io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx); - io_poll_remove_all(ctx); + io_kill_timeouts(ctx, NULL); + io_poll_remove_all(ctx, NULL); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); @@ -7979,7 +8253,13 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) ACCT_LOCKED); INIT_WORK(&ctx->exit_work, io_ring_exit_work); - queue_work(system_wq, &ctx->exit_work); + /* + * Use system_unbound_wq to avoid spawning tons of event kworkers + * if we're exiting a ton of rings at the same time. It just adds + * noise and overhead, there's no discernable change in runtime + * over using system_wq. + */ + queue_work(system_unbound_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -7995,7 +8275,7 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) { struct files_struct *files = data; - return work->files == files; + return !files || work->files == files; } /* @@ -8016,6 +8296,22 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } +static bool io_match_link_files(struct io_kiocb *req, + struct files_struct *files) +{ + struct io_kiocb *link; + + if (io_match_files(req, files)) + return true; + if (req->flags & REQ_F_LINK_HEAD) { + list_for_each_entry(link, &req->link_list, link_list) { + if (io_match_files(link, files)) + return true; + } + } + return false; +} + /* * We're looking to cancel 'req' because it's holding on to our files, but * 'req' could be a link to another request. 
See if it is, and cancel that @@ -8063,12 +8359,68 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, return found; } -static void io_uring_cancel_files(struct io_ring_ctx *ctx, +static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +{ + return io_match_link(container_of(work, struct io_kiocb, work), data); +} + +static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + enum io_wq_cancel cret; + + /* cancel this particular work, if it's running */ + cret = io_wq_cancel_work(ctx->io_wq, &req->work); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* find links that hold this pending, cancel those */ + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* if we have a poll link holding this pending, cancel that */ + if (io_poll_remove_link(ctx, req)) + return; + + /* final option, timeout link is holding this req pending */ + io_timeout_remove_link(ctx, req); +} + +static void io_cancel_defer_files(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct io_defer_entry *de = NULL; + LIST_HEAD(list); + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_reverse(de, &ctx->defer_list, list) { + if (io_match_link_files(de->req, files)) { + list_cut_position(&list, &ctx->defer_list, &de->list); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + + while (!list_empty(&list)) { + de = list_first_entry(&list, struct io_defer_entry, list); + list_del_init(&de->list); + req_set_fail_links(de->req); + io_put_req(de->req); + io_req_complete(de->req, -ECANCELED); + kfree(de); + } +} + +/* + * Returns true if we found and killed one or more files pinning requests + */ +static bool io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { if (list_empty_careful(&ctx->inflight_list)) - return; + return false; + io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ 
io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); @@ -8078,7 +8430,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->work.files != files) + if (files && req->work.files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) @@ -8094,60 +8446,215 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* We need to keep going until we don't find a matching req */ if (!cancel_req) break; + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); + io_put_req(cancel_req); + /* cancellations _may_ trigger task work */ + io_run_task_work(); + schedule(); + finish_wait(&ctx->inflight_wait, &wait); + } - if (cancel_req->flags & REQ_F_OVERFLOW) { - spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->compl.list); - cancel_req->flags &= ~REQ_F_OVERFLOW; + return true; +} - io_cqring_mark_overflow(ctx); - WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; - /* - * Put inflight ref and overflow ref. If that's - * all we had, then we're done with this request. 
- */ - if (refcount_sub_and_test(2, &cancel_req->refs)) { - io_free_req(cancel_req); - finish_wait(&ctx->inflight_wait, &wait); - continue; + return io_task_match(req, task); +} + +static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files) +{ + bool ret; + + ret = io_uring_cancel_files(ctx, files); + if (!files) { + enum io_wq_cancel cret; + + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + ret = true; + + /* SQPOLL thread does its own polling */ + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + while (!list_empty_careful(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; } - } else { - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); - /* could be a link, check and remove if it is */ - if (!io_poll_remove_link(ctx, cancel_req)) - io_timeout_remove_link(ctx, cancel_req); - io_put_req(cancel_req); } - schedule(); - finish_wait(&ctx->inflight_wait, &wait); + ret |= io_poll_remove_all(ctx, task); + ret |= io_kill_timeouts(ctx, task); } + + return ret; } -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +/* + * We need to iteratively cancel requests, in case a request has dependent + * hard links. These persist even for failure of cancelations, hence keep + * looping until none are found. + */ +static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct files_struct *files) { - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct task_struct *task = data; + struct task_struct *task = current; + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + task = ctx->sq_data->thread; + + io_cqring_overflow_flush(ctx, true, task, files); + + while (__io_uring_cancel_task_requests(ctx, task, files)) { + io_run_task_work(); + cond_resched(); + } +} + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. 
+ */ +static int io_uring_add_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + + if (unlikely(!tctx)) { + int ret; + + ret = io_uring_alloc_task_context(current); + if (unlikely(ret)) + return ret; + tctx = current->io_uring; + } + if (tctx->last != file) { + void *old = xa_load(&tctx->xa, (unsigned long)file); + + if (!old) { + get_file(file); + xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL); + } + tctx->last = file; + } + + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. + */ +static void io_uring_del_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + + if (tctx->last == file) + tctx->last = NULL; + file = xa_erase(&tctx->xa, (unsigned long)file); + if (file) + fput(file); +} + +static void __io_uring_attempt_task_drop(struct file *file) +{ + struct file *old = xa_load(¤t->io_uring->xa, (unsigned long)file); + + if (old == file) + io_uring_del_task_file(file); +} + +/* + * Drop task note for this file if we're the only ones that hold it after + * pending fput() + */ +static void io_uring_attempt_task_drop(struct file *file, bool exiting) +{ + if (!current->io_uring) + return; + /* + * fput() is pending, will be 2 if the only other ref is our potential + * task file note. If the task is exiting, drop regardless of count. 
+ */ + if (!exiting && atomic_long_read(&file->f_count) != 2) + return; + + __io_uring_attempt_task_drop(file); +} + +void __io_uring_files_cancel(struct files_struct *files) +{ + struct io_uring_task *tctx = current->io_uring; + struct file *file; + unsigned long index; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + io_uring_cancel_task_requests(ctx, files); + if (files) + io_uring_del_task_file(file); + } +} + +static inline bool io_uring_task_idle(struct io_uring_task *tctx) +{ + return atomic_long_read(&tctx->req_issue) == + atomic_long_read(&tctx->req_complete); +} + +/* + * Find any io_uring fd that this task has registered or done IO on, and cancel + * requests. + */ +void __io_uring_task_cancel(void) +{ + struct io_uring_task *tctx = current->io_uring; + DEFINE_WAIT(wait); + long completions; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + while (!io_uring_task_idle(tctx)) { + /* read completions before cancelations */ + completions = atomic_long_read(&tctx->req_complete); + __io_uring_files_cancel(NULL); + + prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + + /* + * If we've seen completions, retry. This avoids a race where + * a completion comes in before we did prepare_to_wait(). 
+ */ + if (completions != atomic_long_read(&tctx->req_complete)) + continue; + if (io_uring_task_idle(tctx)) + break; + schedule(); + } - return req->task == task; + finish_wait(&tctx->wait, &wait); + tctx->in_idle = false; } static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; - io_uring_cancel_files(ctx, data); - /* * If the task is going away, cancel work it may have pending */ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); + data = NULL; + io_uring_cancel_task_requests(ctx, data); + io_uring_attempt_task_drop(file, !data); return 0; } @@ -8221,6 +8728,25 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ +static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -8232,7 +8758,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_SQ_WAIT)) return -EINVAL; f = fdget(fd); @@ -8248,6 +8775,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (!percpu_ref_tryget(&ctx->refs)) goto out_fput; + ret = -EBADFD; + if (ctx->flags & IORING_SETUP_R_DISABLED) + goto out; + /* * For SQ polling, the thread will do all submissions and completions. 
* Just return the requested submit count, and wake the thread if @@ -8256,13 +8787,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (!list_empty_careful(&ctx->cq_overflow_list)) - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false, NULL, NULL); if (flags & IORING_ENTER_SQ_WAKEUP) - wake_up(&ctx->sqo_wait); + wake_up(&ctx->sq_data->wait); + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { + ret = io_uring_add_task_file(f.file); + if (unlikely(ret)) + goto out; mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit, f.file, fd); + submitted = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); if (submitted != to_submit) @@ -8328,11 +8864,25 @@ static int io_uring_show_cred(int id, void *p, void *data) static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { + struct io_sq_data *sq = NULL; + bool has_lock; int i; - mutex_lock(&ctx->uring_lock); + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. If we fail to get the lock, we just don't iterate any + * structures that could be going away outside the io_uring mutex. + */ + has_lock = mutex_trylock(&ctx->uring_lock); + + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) + sq = ctx->sq_data; + + seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); + seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; i < ctx->nr_user_files; i++) { + for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct fixed_file_table *table; struct file *f; @@ -8344,13 +8894,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "%5u: <none>\n", i); } seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; i < ctx->nr_user_bufs; i++) { + for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, (unsigned int) buf->len); } - if (!idr_is_empty(&ctx->personality_idr)) { + if (has_lock && !idr_is_empty(&ctx->personality_idr)) { seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } @@ -8365,7 +8915,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) req->task->task_works != NULL); } spin_unlock_irq(&ctx->completion_lock); - mutex_unlock(&ctx->uring_lock); + if (has_lock) + mutex_unlock(&ctx->uring_lock); } static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) @@ -8463,6 +9014,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); if (IS_ERR(file)) { +err_fd: put_unused_fd(ret); ret = PTR_ERR(file); goto err; @@ -8471,6 +9023,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif + if (unlikely(io_uring_add_task_file(file))) { + file = ERR_PTR(-ENOMEM); + goto err_fd; + } fd_install(ret, file); return ret; err: @@ -8548,6 +9104,36 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = user; ctx->creds = get_current_cred(); + ctx->sqo_task = get_task_struct(current); + + /* + * This is just grabbed for accounting purposes. 
When a process exits, + * the mm is exited and dropped before the files, hence we need to hang + * on to this mm purely for the purposes of being able to unaccount + * memory (locked/pinned vm). It's not used for anything else. + */ + mmgrab(current->mm); + ctx->mm_account = current->mm; + +#ifdef CONFIG_BLK_CGROUP + /* + * The sq thread will belong to the original cgroup it was inited in. + * If the cgroup goes offline (e.g. disabling the io controller), then + * issued bios will be associated with the closest cgroup later in the + * block layer. + */ + rcu_read_lock(); + ctx->sqo_blkcg_css = blkcg_css(); + ret = css_tryget_online(ctx->sqo_blkcg_css); + rcu_read_unlock(); + if (!ret) { + /* don't init against a dying cgroup, have the user try again */ + ctx->sqo_blkcg_css = NULL; + ret = -ENODEV; + goto err; + } +#endif + /* * Account memory _before_ installing the file descriptor. Once * the descriptor is installed, it can get closed at any time. Also @@ -8562,10 +9148,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_start(ctx, p); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; + if (!(p->flags & IORING_SETUP_R_DISABLED)) + io_sq_offload_start(ctx); + memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -8628,7 +9217,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | + IORING_SETUP_R_DISABLED)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -8704,6 +9294,91 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } +static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, + unsigned 
int nr_args) +{ + struct io_uring_restriction *res; + size_t size; + int i, ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + if (!arg || nr_args > IORING_MAX_RESTRICTIONS) + return -EINVAL; + + size = array_size(nr_args, sizeof(*res)); + if (size == SIZE_MAX) + return -EOVERFLOW; + + res = memdup_user(arg, size); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = 0; + + for (i = 0; i < nr_args; i++) { + switch (res[i].opcode) { + case IORING_RESTRICTION_REGISTER_OP: + if (res[i].register_op >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].register_op, + ctx->restrictions.register_op); + break; + case IORING_RESTRICTION_SQE_OP: + if (res[i].sqe_op >= IORING_OP_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + break; + case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: + ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + break; + case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: + ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + break; + default: + ret = -EINVAL; + goto out; + } + } + +out: + /* Reset all restrictions if an error happened */ + if (ret != 0) + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); + else + ctx->restrictions.registered = true; + + kfree(res); + return ret; +} + +static int io_register_enable_rings(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + if (ctx->restrictions.registered) + ctx->restricted = 1; + + ctx->flags &= ~IORING_SETUP_R_DISABLED; + + io_sq_offload_start(ctx); + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -8745,11 +9420,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. 
*/ mutex_unlock(&ctx->uring_lock); - ret = wait_for_completion_interruptible(&ctx->ref_comp); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + if (ret < 0) + break; + } while (1); + mutex_lock(&ctx->uring_lock); + if (ret) { percpu_ref_resurrect(&ctx->refs); - ret = -EINTR; + goto out_quiesce; + } + } + + if (ctx->restricted) { + if (opcode >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + if (!test_bit(opcode, ctx->restrictions.register_op)) { + ret = -EACCES; goto out; } } @@ -8813,15 +9508,25 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_personality(ctx, nr_args); break; + case IORING_REGISTER_ENABLE_RINGS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_enable_rings(ctx); + break; + case IORING_REGISTER_RESTRICTIONS: + ret = io_register_restrictions(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; } +out: if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); -out: +out_quiesce: reinit_completion(&ctx->ref_comp); } return ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcfc288dba3f..8180061b9e16 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -22,18 +22,25 @@ #include "../internal.h" /* - * Structure allocated for each page when block size < PAGE_SIZE to track - * sub-page uptodate status and I/O completions. + * Structure allocated for each page or THP when block size < page size + * to track sub-page uptodate status and I/O completions. */ struct iomap_page { - atomic_t read_count; - atomic_t write_count; + atomic_t read_bytes_pending; + atomic_t write_bytes_pending; spinlock_t uptodate_lock; - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); + unsigned long uptodate[]; }; static inline struct iomap_page *to_iomap_page(struct page *page) { + /* + * per-block data is stored in the head page. 
Callers should + * not be dealing with tail pages (and if they are, they can + * call thp_head() first. + */ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + if (page_has_private(page)) return (struct iomap_page *)page_private(page); return NULL; @@ -45,20 +52,16 @@ static struct iomap_page * iomap_page_create(struct inode *inode, struct page *page) { struct iomap_page *iop = to_iomap_page(page); + unsigned int nr_blocks = i_blocks_per_page(inode, page); - if (iop || i_blocksize(inode) == PAGE_SIZE) + if (iop || nr_blocks <= 1) return iop; - iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); - atomic_set(&iop->read_count, 0); - atomic_set(&iop->write_count, 0); + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), + GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); - bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. - */ + if (PageUptodate(page)) + bitmap_fill(iop->uptodate, nr_blocks); attach_page_private(page, iop); return iop; } @@ -67,11 +70,14 @@ static void iomap_page_release(struct page *page) { struct iomap_page *iop = detach_page_private(page); + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); if (!iop) return; - WARN_ON_ONCE(atomic_read(&iop->read_count)); - WARN_ON_ONCE(atomic_read(&iop->write_count)); + WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); + WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + PageUptodate(page)); kfree(iop); } @@ -142,19 +148,11 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) struct inode *inode = page->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; - bool uptodate = true; unsigned long flags; - unsigned int i; spin_lock_irqsave(&iop->uptodate_lock, flags); - for (i = 0; i < PAGE_SIZE / i_blocksize(inode); 
i++) { - if (i >= first && i <= last) - set_bit(i, iop->uptodate); - else if (!test_bit(i, iop->uptodate)) - uptodate = false; - } - - if (uptodate) + bitmap_set(iop->uptodate, first, last - first + 1); + if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) SetPageUptodate(page); spin_unlock_irqrestore(&iop->uptodate_lock, flags); } @@ -172,13 +170,6 @@ iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) } static void -iomap_read_finish(struct iomap_page *iop, struct page *page) -{ - if (!iop || atomic_dec_and_test(&iop->read_count)) - unlock_page(page); -} - -static void iomap_read_page_end_io(struct bio_vec *bvec, int error) { struct page *page = bvec->bv_page; @@ -191,7 +182,8 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error) iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); } - iomap_read_finish(iop, page); + if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending)) + unlock_page(page); } static void @@ -271,30 +263,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } ctx->cur_page_in_bio = true; + if (iop) + atomic_add(plen, &iop->read_bytes_pending); - /* - * Try to merge into a previous segment if we can. - */ + /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) + if (ctx->bio && bio_end_sector(ctx->bio) == sector) { + if (__bio_try_merge_page(ctx->bio, page, plen, poff, + &same_page)) + goto done; is_contig = true; - - if (is_contig && - __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { - if (!same_page && iop) - atomic_inc(&iop->read_count); - goto done; } - /* - * If we start a new segment we need to increase the read count, and we - * need to do so before submitting any previous full bio to make sure - * that we don't prematurely unlock the page. 
- */ - if (iop) - atomic_inc(&iop->read_count); - - if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { + if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -571,13 +552,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, { struct iomap_page *iop = iomap_page_create(inode, page); loff_t block_size = i_blocksize(inode); - loff_t block_start = pos & ~(block_size - 1); - loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); + loff_t block_start = round_down(pos, block_size); + loff_t block_end = round_up(pos + len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; - int status; if (PageUptodate(page)) return 0; + ClearPageError(page); do { iomap_adjust_read_range(inode, iop, &block_start, @@ -594,14 +575,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); - iomap_set_range_uptodate(page, poff, plen); - continue; + } else { + int status = iomap_read_page_sync(block_start, page, + poff, plen, srcmap); + if (status) + return status; } - - status = iomap_read_page_sync(block_start, page, poff, plen, - srcmap); - if (status) - return status; + iomap_set_range_uptodate(page, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -685,9 +665,8 @@ iomap_set_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(iomap_set_page_dirty); -static int -__iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) +static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page) { flush_dcache_page(page); @@ -709,15 +688,15 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, return copied; } -static int 
-iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, unsigned copied) +static size_t iomap_write_end_inline(struct inode *inode, struct page *page, + struct iomap *iomap, loff_t pos, size_t copied) { void *addr; WARN_ON_ONCE(!PageUptodate(page)); BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + flush_dcache_page(page); addr = kmap_atomic(page); memcpy(iomap->inline_data + pos, addr + pos, copied); kunmap_atomic(addr); @@ -726,13 +705,14 @@ iomap_write_end_inline(struct inode *inode, struct page *page, return copied; } -static int -iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, - struct page *page, struct iomap *iomap, struct iomap *srcmap) +/* Returns the number of bytes copied. May be 0. Cannot be an errno. */ +static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page, struct iomap *iomap, + struct iomap *srcmap) { const struct iomap_page_ops *page_ops = iomap->page_ops; loff_t old_size = inode->i_size; - int ret; + size_t ret; if (srcmap->type == IOMAP_INLINE) { ret = iomap_write_end_inline(inode, page, iomap, pos, copied); @@ -811,13 +791,8 @@ again: copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - flush_dcache_page(page); - - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, + copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); - if (unlikely(status < 0)) - break; - copied = status; cond_resched(); @@ -891,11 +866,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); - if (unlikely(status <= 0)) { - if (WARN_ON_ONCE(status == 0)) - return -EIO; - return status; - } + if (WARN_ON_ONCE(status == 0)) + return -EIO; cond_resched(); @@ -928,11 +900,13 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); 
-static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap, struct iomap *srcmap) +static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, + struct iomap *iomap, struct iomap *srcmap) { struct page *page; int status; + unsigned offset = offset_in_page(pos); + unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); if (status) @@ -944,38 +918,33 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static loff_t -iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, + loff_t length, void *data, struct iomap *iomap, + struct iomap *srcmap) { bool *did_zero = data; loff_t written = 0; - int status; /* already zeroed? we're done. 
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return count; + return length; do { - unsigned offset, bytes; - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, count); + s64 bytes; if (IS_DAX(inode)) - status = dax_iomap_zero(pos, offset, bytes, iomap); + bytes = dax_iomap_zero(pos, length, iomap); else - status = iomap_zero(inode, pos, offset, bytes, iomap, - srcmap); - if (status < 0) - return status; + bytes = iomap_zero(inode, pos, length, iomap, srcmap); + if (bytes < 0) + return bytes; pos += bytes; - count -= bytes; + length -= bytes; written += bytes; if (did_zero) *did_zero = true; - } while (count > 0); + } while (length > 0); return written; } @@ -1070,7 +1039,7 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_page_writeback(struct inode *inode, struct page *page, - int error) + int error, unsigned int len) { struct iomap_page *iop = to_iomap_page(page); @@ -1079,10 +1048,10 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, mapping_set_error(inode->i_mapping, -EIO); } - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); - if (!iop || atomic_dec_and_test(&iop->write_count)) + if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) end_page_writeback(page); } @@ -1116,7 +1085,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) /* walk each page on bio, ending page IO on them */ bio_for_each_segment_all(bv, bio, iter_all) - iomap_finish_page_writeback(inode, bv->bv_page, error); + iomap_finish_page_writeback(inode, bv->bv_page, error, + bv->bv_len); bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1332,8 +1302,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, 
poff, &same_page); - if (iop && !same_page) - atomic_inc(&iop->write_count); + if (iop) + atomic_add(len, &iop->write_bytes_pending); if (!merged) { if (bio_full(wpc->ioend->io_bio, len)) { @@ -1375,8 +1345,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); /* * Walk through the page to find areas to write back. If we run off the diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c1aafb2ab990..933f234d5bec 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -76,7 +76,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, dio->submit.cookie = submit_bio(bio); } -static ssize_t iomap_dio_complete(struct iomap_dio *dio) +ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; @@ -108,7 +108,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { int err; err = invalidate_inode_pages2_range(inode->i_mapping, @@ -118,6 +118,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + inode_dio_end(file_inode(iocb->ki_filp)); /* * If this is a DSYNC write, make sure we push it to stable storage now * that we've written data. 
@@ -125,11 +126,11 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); - inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); return ret; } +EXPORT_SYMBOL_GPL(iomap_dio_complete); static void iomap_dio_complete_work(struct work_struct *work) { @@ -388,6 +389,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return iomap_dio_bio_actor(inode, pos, length, dio, iomap); case IOMAP_INLINE: return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + case IOMAP_DELALLOC: + /* + * DIO is not serialised against mmap() access at all, and so + * if the page_mkwrite occurs between the writeback and the + * iomap_apply() call in the DIO path, then it will see the + * DELALLOC block that the page-mkwrite allocated. + */ + pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", + dio->iocb->ki_filp, current->comm); + return -EIO; default: WARN_ON_ONCE(1); return -EIO; @@ -406,8 +417,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. 
*/ -ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, +struct iomap_dio * +__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, bool wait_for_completion) { @@ -421,14 +432,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_dio *dio; if (!count) - return 0; + return NULL; if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) - return -EIO; + return ERR_PTR(-EIO); dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) - return -ENOMEM; + return ERR_PTR(-ENOMEM); dio->iocb = iocb; atomic_set(&dio->ref, 1); @@ -558,7 +569,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { if (!wait_for_completion) - return -EIOCBQUEUED; + return ERR_PTR(-EIOCBQUEUED); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -574,10 +585,26 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - return iomap_dio_complete(dio); + return dio; out_free_dio: kfree(dio); - return ret; + if (ret) + return ERR_PTR(ret); + return NULL; +} +EXPORT_SYMBOL_GPL(__iomap_dio_rw); + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion) +{ + struct iomap_dio *dio; + + dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion); + if (IS_ERR_OR_NULL(dio)) + return PTR_ERR_OR_ZERO(dio); + return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 89f61d93c0bc..107ee80c3568 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -127,7 +127,7 @@ iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, SEEK_HOLE); if (offset < 0) return length; - /* fall through */ + fallthrough; case IOMAP_HOLE: *(loff_t *)data = offset; return 0; @@ -175,7 +175,7 @@ iomap_seek_data_actor(struct inode 
*inode, loff_t offset, loff_t length, SEEK_DATA); if (offset < 0) return length; - /*FALLTHRU*/ + fallthrough; default: *(loff_t *)data = offset; return 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e4944436e733..17fdc482f554 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1285,7 +1285,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) * superblock as being NULL to prevent the journal destroy from writing * back a bogus superblock. */ -static void journal_fail_superblock (journal_t *journal) +static void journal_fail_superblock(journal_t *journal) { struct buffer_head *bh = journal->j_sb_buffer; brelse(bh); @@ -1367,8 +1367,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) int ret; /* Buffer got discarded which means block device got invalidated */ - if (!buffer_mapped(bh)) + if (!buffer_mapped(bh)) { + unlock_buffer(bh); return -EIO; + } trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) @@ -1815,7 +1817,7 @@ int jbd2_journal_destroy(journal_t *journal) /** - *int jbd2_journal_check_used_features () - Check if features specified are used. + *int jbd2_journal_check_used_features() - Check if features specified are used. * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -1825,7 +1827,7 @@ int jbd2_journal_destroy(journal_t *journal) * features. Return true (non-zero) if it does. **/ -int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, +int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { journal_superblock_t *sb; @@ -1860,7 +1862,7 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, * all of a given set of features on this journal. Return true * (non-zero) if it can. 
*/ -int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, +int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { if (!compat && !ro && !incompat) @@ -1882,7 +1884,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com } /** - * int jbd2_journal_set_features () - Mark a given journal feature in the superblock + * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -1893,7 +1895,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com * */ -int jbd2_journal_set_features (journal_t *journal, unsigned long compat, +int jbd2_journal_set_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { #define INCOMPAT_FEATURE_ON(f) \ diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 2ed278f0dced..faa97d748474 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -690,14 +690,11 @@ static int do_one_pass(journal_t *journal, * number. 
*/ if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal)) { - int chksum_err, chksum_seen; struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = be32_to_cpu(cbh->h_chksum[0]); - chksum_err = chksum_seen = 0; - if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; @@ -705,42 +702,23 @@ static int do_one_pass(journal_t *journal, break; } - if (crc32_sum == found_chksum && - cbh->h_chksum_type == JBD2_CRC32_CHKSUM && - cbh->h_chksum_size == - JBD2_CRC32_CHKSUM_SIZE) - chksum_seen = 1; - else if (!(cbh->h_chksum_type == 0 && - cbh->h_chksum_size == 0 && - found_chksum == 0 && - !chksum_seen)) - /* - * If fs is mounted using an old kernel and then - * kernel with journal_chksum is used then we - * get a situation where the journal flag has - * checksum flag set but checksums are not - * present i.e chksum = 0, in the individual - * commit blocks. - * Hence to avoid checksum failures, in this - * situation, this extra check is added. - */ - chksum_err = 1; - - if (chksum_err) { - info->end_transaction = next_commit_ID; - - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - brelse(bh); - break; - } - } + /* Neither checksum match nor unused? 
*/ + if (!((crc32_sum == found_chksum && + cbh->h_chksum_type == + JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) || + (cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0))) + goto chksum_error; + crc32_sum = ~0; } if (pass == PASS_SCAN && !jbd2_commit_block_csum_verify(journal, bh->b_data)) { + chksum_error: info->end_transaction = next_commit_ID; if (!jbd2_has_feature_async_commit(journal)) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e91aad3637a2..43985738aa86 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2026,6 +2026,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { + J_ASSERT_JH(jh, jh->b_transaction != NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; } @@ -2078,10 +2081,6 @@ out: * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit - * code to release the buffers. 
- * * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN @@ -2112,11 +2111,11 @@ out: * * Return 0 on failure, 1 on success */ -int jbd2_journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t gfp_mask) +int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) { struct buffer_head *head; struct buffer_head *bh; + bool has_write_io_error = false; int ret = 0; J_ASSERT(PageLocked(page)); @@ -2141,11 +2140,26 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; + + /* + * If we free a metadata buffer which has been failed to + * write out, the jbd2 checkpoint procedure will not detect + * this failure and may lead to filesystem inconsistency + * after cleanup journal tail. + */ + if (buffer_write_io_error(bh)) { + pr_err("JBD2: Error while async write back metadata bh %llu.", + (unsigned long long)bh->b_blocknr); + has_write_io_error = true; + } } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(page); busy: + if (has_write_io_error) + jbd2_journal_abort(journal, -EIO); + return ret; } @@ -2572,6 +2586,13 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); + + /* + * b_transaction must be set, otherwise the new b_transaction won't + * be holding jh reference + */ + J_ASSERT_JH(jh, jh->b_transaction != NULL); + /* * We set b_transaction here because b_next_transaction will inherit * our jh reference and thus __jbd2_journal_file_buffer() must not diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index ab8cdd9e9325..78858f6e9583 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -341,7 +341,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); else rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); - /* fall through */ + fallthrough; case 
S_IFSOCK: case S_IFIFO: diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index bccfc40b3a74..2f6f0b140c05 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1273,7 +1273,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, dbg_readinode("symlink's target '%s' cached\n", f->target); } - /* fall through... */ + fallthrough; case S_IFBLK: case S_IFCHR: diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index a2f5338a5ea1..176580f54af9 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -473,7 +473,7 @@ static int metapage_readpage(struct file *fp, struct page *page) struct inode *inode = page->mapping->host; struct bio *bio = NULL; int block_offset; - int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + int blocks_per_page = i_blocks_per_page(inode, page); sector_t page_start; /* address of page in fs blocks */ sector_t pblock; int xlen; diff --git a/fs/libfs.c b/fs/libfs.c index 4d08edf19c78..e0d42e977d9a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -137,11 +137,11 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case 1: offset += file->f_pos; - /* fall through */ + fallthrough; case 0: if (offset >= 0) break; - /* fall through */ + fallthrough; default: return -EINVAL; } diff --git a/fs/locks.c b/fs/locks.c index 8fc0542f5132..1f84a03601fe 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1499,7 +1499,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg) switch (arg) { case F_UNLCK: fl->fl_flags &= ~FL_UNLOCK_PENDING; - /* fall through */ + fallthrough; case F_RDLCK: fl->fl_flags &= ~FL_DOWNGRADE_PENDING; } @@ -2525,7 +2525,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, cmd = F_SETLKW; file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = filp; - /* Fallthrough */ + fallthrough; case F_SETLKW: file_lock->fl_flags |= FL_SLEEP; } @@ -2656,7 +2656,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, cmd = 
F_SETLKW64; file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = filp; - /* Fallthrough */ + fallthrough; case F_SETLKW64: file_lock->fl_flags |= FL_SLEEP; } diff --git a/fs/namei.c b/fs/namei.c index e99e2a9da0f7..f1eb8ccd2be9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -568,8 +568,8 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; - /* Bind mounts and multi-root filesystems can have disconnected paths */ - if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) + /* Bind mounts can have disconnected paths */ + if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); diff --git a/fs/namespace.c b/fs/namespace.c index bae0e95b3713..294e05a13d17 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3072,10 +3072,10 @@ static void shrink_submounts(struct mount *mnt) } } -void *copy_mount_options(const void __user * data) +static void *copy_mount_options(const void __user * data) { char *copy; - unsigned size; + unsigned left, offset; if (!data) return NULL; @@ -3084,20 +3084,31 @@ void *copy_mount_options(const void __user * data) if (!copy) return ERR_PTR(-ENOMEM); - size = PAGE_SIZE - offset_in_page(data); + left = copy_from_user(copy, data, PAGE_SIZE); - if (copy_from_user(copy, data, size)) { + /* + * Not all architectures have an exact copy_from_user(). Resort to + * byte at a time. + */ + offset = PAGE_SIZE - left; + while (left) { + char c; + if (get_user(c, (const char __user *)data + offset)) + break; + copy[offset] = c; + left--; + offset++; + } + + if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } - if (size != PAGE_SIZE) { - if (copy_from_user(copy + size, data + size, PAGE_SIZE - size)) - memset(copy + size, 0, PAGE_SIZE - size); - } + return copy; } -char *copy_mount_string(const void __user *data) +static char *copy_mount_string(const void __user *data) { return data ? 
strndup_user(data, PATH_MAX) : NULL; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d1a0e2c8b1b4..08108b6d2fa1 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -753,7 +753,7 @@ out: case -ENODEV: /* Our extent block devices are unavailable */ set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); - /* Fall through */ + fallthrough; case 0: return lseg; default: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a12f42e7d8c7..cb52db9a0cfb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -579,6 +579,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { + if (entry->label) + entry->label->len = NFS4_MAXLABELLEN; + status = xdr_decode(desc, entry, &stream); if (status != 0) { if (status == -EAGAIN) @@ -1181,7 +1184,7 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) /* A NFSv4 OPEN will revalidate later */ if (server->caps & NFS_CAP_ATOMIC_OPEN) goto out; - /* Fallthrough */ + fallthrough; case S_IFDIR: if (server->flags & NFS_MOUNT_NOCTO) break; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a13e69009f19..7f5aa0403e16 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -187,7 +187,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, pnfs_error_mark_layout_for_return(inode, lseg); pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); - /* fall through */ + fallthrough; default: reset: dprintk("%s Retry through MDS. 
Error %d\n", __func__, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 965145592750..a163533446fa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -715,7 +715,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -724,7 +724,7 @@ ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -734,14 +734,14 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx, + u32 start_idx, u32 *best_idx, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; bool fail_return = false; - int idx; + u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { @@ -766,21 +766,21 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); } static struct 
nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { struct nfs4_pnfs_ds *ds; @@ -791,7 +791,8 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static struct nfs4_pnfs_ds * -ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx) +ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, + u32 *best_idx) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; @@ -837,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - int ds_idx; + u32 ds_idx, i; retry: ff_layout_pg_check_layout(pgio, req); @@ -863,14 +864,14 @@ retry: goto retry; } - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + for (i = 0; i < pgio->pg_mirror_count; i++) { + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + pgm = &pgio->pg_mirrors[i]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + } pgio->pg_mirror_idx = ds_idx; - /* read always uses only one mirror - idx 0 for pgio layer */ - pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; - if (NFS_SERVER(pgio->pg_inode)->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) pgio->pg_maxretrans = io_maxretrans; @@ -894,7 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - int i; + u32 i; retry: ff_layout_pg_check_layout(pgio, req); @@ -1038,7 +1039,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; - int new_idx = 0; + u32 new_idx = 0; if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) ff_layout_send_layouterror(hdr->lseg); @@ -1075,7 +1076,7 @@ static int 
ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; @@ -1133,7 +1134,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - /* fall through */ + fallthrough; default: if (ff_layout_avoid_mds_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; @@ -1149,7 +1150,7 @@ reset: /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1184,7 +1185,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { int vers = clp->cl_nfs_mod->rpc_vers->number; @@ -1211,7 +1212,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - int idx, u64 offset, u64 length, + u32 idx, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1260,7 +1261,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, */ if (opnum == OP_READ) break; - /* Fallthrough */ + fallthrough; default: pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); @@ -1809,7 +1810,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; - int idx = hdr->pgio_mirror_idx; + u32 idx = hdr->pgio_mirror_idx; mirror = FF_LAYOUT_COMP(lseg, idx); ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); diff --git a/fs/nfs/fs_context.c 
b/fs/nfs/fs_context.c index 66949da0e827..222afba70bc0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -651,21 +651,21 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: protofamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_udp: ctx->flags &= ~NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_tcp: ctx->flags |= NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_rdma: /* vector side protocols to TCP */ ctx->flags |= NFS_MOUNT_TCP; @@ -684,13 +684,13 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: mountfamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_udp: ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_tcp: ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -899,9 +899,11 @@ static int nfs23_parse_monolithic(struct fs_context *fc, ctx->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: - data->namlen = 0; /* fall through */ + data->namlen = 0; + fallthrough; case 2: - data->bsize = 0; /* fall through */ + data->bsize = 0; + fallthrough; case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; @@ -909,14 +911,14 @@ static int nfs23_parse_monolithic(struct fs_context *fc, memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ extra_flags |= NFS_MOUNT_SECFLAVOUR; - /* fall through */ + fallthrough; case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; - /* fall through */ + fallthrough; case 5: 
memset(data->context, 0, sizeof(data->context)); - /* fall through */ + fallthrough; case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) @@ -1037,6 +1039,65 @@ out_invalid_fh: } #if IS_ENABLED(CONFIG_NFS_V4) +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data) +{ + struct compat_nfs4_mount_data_v1 *compat = + (struct compat_nfs4_mount_data_v1 *)data; + + /* copy the fields backwards */ + data->auth_flavours = compat_ptr(compat->auth_flavours); + data->auth_flavourlen = compat->auth_flavourlen; + data->proto = compat->proto; + data->host_addr = compat_ptr(compat->host_addr); + data->host_addrlen = compat->host_addrlen; + compat_nfs_string(&data->hostname, &compat->hostname); + compat_nfs_string(&data->mnt_path, &compat->mnt_path); + compat_nfs_string(&data->client_addr, &compat->client_addr); + data->acdirmax = compat->acdirmax; + data->acdirmin = compat->acdirmin; + data->acregmax = compat->acregmax; + data->acregmin = compat->acregmin; + data->retrans = compat->retrans; + data->timeo = compat->timeo; + data->wsize = compat->wsize; + data->rsize = compat->rsize; + data->flags = compat->flags; + data->version = compat->version; +} + /* * 
Validate NFSv4 mount options */ @@ -1047,89 +1108,83 @@ static int nfs4_parse_monolithic(struct fs_context *fc, struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; char *c; - if (data == NULL) - goto out_no_data; + if (!data) { + if (is_remount_fc(fc)) + goto done; + return nfs_invalf(fc, + "NFS4: mount program didn't pass any mount data"); + } ctx->version = 4; - switch (data->version) { - case 1: - if (data->host_addrlen > sizeof(ctx->nfs_server.address)) - goto out_no_address; - if (data->host_addrlen == 0) - goto out_no_address; - ctx->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(sap, data->host_addr, data->host_addrlen)) - return -EFAULT; - if (!nfs_verify_server_address(sap)) - goto out_no_address; - ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - - if (data->auth_flavourlen) { - rpc_authflavor_t pseudoflavor; - if (data->auth_flavourlen > 1) - goto out_inval_auth; - if (copy_from_user(&pseudoflavor, - data->auth_flavours, - sizeof(pseudoflavor))) - return -EFAULT; - ctx->selected_flavor = pseudoflavor; - } else - ctx->selected_flavor = RPC_AUTH_UNIX; - - c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->nfs_server.hostname = c; + if (data->version != 1) + return generic_parse_monolithic(fc, data); - c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->nfs_server.export_path = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + if (in_compat_syscall()) + nfs4_compat_mount_data_conv(data); - c = strndup_user(data->client_addr.data, 16); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->client_address = c; - - /* - * Translate to nfs_fs_context, which nfs_fill_super - * can deal with. 
- */ + if (data->host_addrlen > sizeof(ctx->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + ctx->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; - ctx->rsize = data->rsize; - ctx->wsize = data->wsize; - ctx->timeo = data->timeo; - ctx->retrans = data->retrans; - ctx->acregmin = data->acregmin; - ctx->acregmax = data->acregmax; - ctx->acdirmin = data->acdirmin; - ctx->acdirmax = data->acdirmax; - ctx->nfs_server.protocol = data->proto; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; + if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; - break; - default: - goto generic; + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&pseudoflavor, data->auth_flavours, + sizeof(pseudoflavor))) + return -EFAULT; + ctx->selected_flavor = pseudoflavor; + } else { + ctx->selected_flavor = RPC_AUTH_UNIX; } + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->client_address = c; + + /* + * Translate to nfs_fs_context, which nfs_fill_super + * can deal with. 
+ */ + + ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; + ctx->rsize = data->rsize; + ctx->wsize = data->wsize; + ctx->timeo = data->timeo; + ctx->retrans = data->retrans; + ctx->acregmin = data->acregmin; + ctx->acregmax = data->acregmax; + ctx->acdirmin = data->acdirmin; + ctx->acdirmax = data->acdirmax; + ctx->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(ctx); + if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; +done: ctx->skip_reconfig_option_check = true; return 0; -generic: - return generic_parse_monolithic(fc, data); - -out_no_data: - if (is_remount_fc(fc)) { - ctx->skip_reconfig_option_check = true; - return 0; - } - return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data"); - out_inval_auth: return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d", data->auth_flavourlen); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 26c94b32d6f4..c6c863382f37 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -108,7 +108,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) case -EPROTONOSUPPORT: dprintk("NFS_V3_ACL extension not supported; disabling\n"); server->caps &= ~NFS_CAP_ACLS; - /* fall through */ + fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; default: @@ -228,7 +228,7 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS_V3_ACL SETACL RPC not supported" "(will not retry)\n"); server->caps &= ~NFS_CAP_ACLS; - /* fall through */ + fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 142225f0af59..2b2211d1234e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -356,7 +356,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - + spin_lock(&dst_inode->i_lock); + NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | 
NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA); + spin_unlock(&dst_inode->i_lock); + spin_lock(&src_inode->i_lock); + NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME); + spin_unlock(&src_inode->i_lock); status = res->write_res.count; out: if (args->sync) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a33970765467..fdfc77486ace 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -211,7 +211,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) ret = nfs42_proc_llseek(filep, offset, whence); if (ret != -ENOTSUPP) return ret; - /* Fall through */ + fallthrough; default: return nfs_file_llseek(filep, offset, whence); } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 1e7296395d71..62e6eea5c516 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -520,7 +520,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, switch (token) { case Opt_find_uid: im->im_type = IDMAP_TYPE_USER; - /* Fall through */ + fallthrough; case Opt_find_gid: im->im_conv = IDMAP_CONV_NAMETOID; ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); @@ -528,7 +528,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, case Opt_find_user: im->im_type = IDMAP_TYPE_USER; - /* Fall through */ + fallthrough; case Opt_find_group: im->im_conv = IDMAP_CONV_IDTONAME; ret = match_int(&substr, &im->im_id); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dbd01548335b..6e95c85fe395 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -483,7 +483,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, stateid); goto wait_on_recovery; } - /* Fall through */ + fallthrough; case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -534,10 +534,10 @@ static int nfs4_do_handle_exception(struct nfs_server *server, ret = -EBUSY; break; } - /* Fall through */ + fallthrough; case -NFS4ERR_DELAY: 
nfs_inc_server_stats(server, NFSIOS_DELAY); - /* Fall through */ + fallthrough; case -NFS4ERR_GRACE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: @@ -1505,7 +1505,7 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, case NFS4_OPEN_CLAIM_PREVIOUS: if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) break; - /* Fall through */ + fallthrough; default: return 0; } @@ -2439,7 +2439,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; - /* Fall through */ + fallthrough; case NFS4_OPEN_CLAIM_FH: task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; } @@ -3293,8 +3293,10 @@ static int _nfs4_do_setattr(struct inode *inode, /* Servers should only apply open mode checks for file size changes */ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; - if (!truncate) + if (!truncate) { + nfs4_inode_make_writeable(inode); goto zero_stateid; + } if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) { /* Use that stateid */ @@ -3545,11 +3547,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) nfs4_free_revoked_stateid(server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - /* Fallthrough */ + fallthrough; case -NFS4ERR_BAD_STATEID: if (calldata->arg.fmode == 0) break; - /* Fallthrough */ + fallthrough; default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); @@ -6294,7 +6296,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) nfs4_free_revoked_stateid(data->res.server, data->args.stateid, task->tk_msg.rpc_cred); - /* Fallthrough */ + fallthrough; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: case -ETIMEDOUT: @@ -6314,7 +6316,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) data->res.fattr = NULL; goto 
out_restart; } - /* Fallthrough */ + fallthrough; default: task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, @@ -6622,13 +6624,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; - /* Fall through */ + fallthrough; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(calldata->server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - /* Fall through */ + fallthrough; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: if (nfs4_sync_lock_stateid(&calldata->arg.stateid, @@ -7298,7 +7300,12 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, err = nfs4_set_lock_state(state, fl); if (err != 0) return err; - err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + do { + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + if (err != -NFS4ERR_DELAY) + break; + ssleep(1); + } while (err == -NFS4ERR_DELAY); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -8665,7 +8672,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; - /* fall through */ + fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: rpc_restart_call_prepare(task); return; @@ -9113,13 +9120,13 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf switch(task->tk_status) { case 0: wake_up_all(&clp->cl_lock_waitq); - /* Fallthrough */ + fallthrough; case -NFS4ERR_COMPLETE_ALREADY: case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ break; case -NFS4ERR_DELAY: rpc_delay(task, NFS4_POLL_RETRY_MAX); - /* fall through */ + fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; case -NFS4ERR_BADSESSION: @@ -9434,10 +9441,10 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) &lrp->args.range, lrp->args.inode)) goto out_restart; - /* Fallthrough */ + fallthrough; default: task->tk_status = 0; - /* Fallthrough */ + fallthrough; case 0: break; case -NFS4ERR_DELAY: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b1dba24918f8..4bf10792cb5b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1530,7 +1530,7 @@ restart: default: pr_err("NFS: %s: unhandled error %d\n", __func__, status); - /* Fall through */ + fallthrough; case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: @@ -1667,7 +1667,7 @@ restart: break; } printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); - /* Fall through */ + fallthrough; case -ENOENT: case -ENOMEM: case -EACCES: @@ -1683,7 +1683,7 @@ restart: set_bit(ops->state_flag_bit, &state->flags); break; } - /* Fall through */ + fallthrough; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -1695,7 +1695,7 @@ restart: case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); - /* Fall through */ + fallthrough; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -2273,11 +2273,11 @@ again: case -ETIMEDOUT: if (clnt->cl_softrtry) break; - /* Fall through */ + fallthrough; case -NFS4ERR_DELAY: case -EAGAIN: ssleep(1); - /* Fall through */ + fallthrough; case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); @@ -2289,7 +2289,7 @@ again: } if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) break; - /* Fall through */ + fallthrough; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: /* No point in retrying if we already used RPC_AUTH_UNIX */ diff --git 
a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6ea4cac41e46..6985cacf4700 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -711,7 +711,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, case FLUSH_COND_STABLE: if (nfs_reqs_to_commit(cinfo)) break; - /* fall through */ + fallthrough; default: hdr->args.stable = NFS_FILE_SYNC; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 40332c758d84..71f7741126b6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1541,7 +1541,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, case 0: if (res->lrs_present) res_stateid = &res->stateid; - /* Fallthrough */ + fallthrough; default: arg_stateid = &args->stateid; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 7a70287f21a2..f943e37853fa 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1200,13 +1200,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, } #endif -static void nfs_set_readahead(struct backing_dev_info *bdi, - unsigned long iomax_pages) -{ - bdi->ra_pages = VM_READAHEAD_PAGES; - bdi->io_pages = iomax_pages; -} - int nfs_get_tree_common(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); @@ -1251,7 +1244,7 @@ int nfs_get_tree_common(struct fs_context *fc) MINOR(server->s_dev)); if (error) goto error_splat_super; - nfs_set_readahead(s->s_bdi, server->rpages); + s->s_bdi->io_pages = server->rpages; server->super = s; } diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 8ceb6425e01a..d056ad2fdefd 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -237,7 +237,7 @@ posix_acl_from_nfsacl(struct posix_acl *acl) break; case ACL_MASK: mask = pa; - /* fall through */ + fallthrough; case ACL_OTHER: break; } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 9bbaa671c079..a07c39c94bbd 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -83,13 +83,13 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, bex->soff = iomap.addr; 
break; } - /*FALLTHRU*/ + fallthrough; case IOMAP_HOLE: if (seg->iomode == IOMODE_READ) { bex->es = PNFS_BLOCK_NONE_DATA; break; } - /*FALLTHRU*/ + fallthrough; case IOMAP_DELALLOC: default: WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); @@ -170,7 +170,7 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); } @@ -382,7 +382,7 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7fbe9840a03e..052be5bf9ef5 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1119,7 +1119,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - /* Fall through */ + fallthrough; case 1: case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index e12409eca7cc..a97873f2d22b 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -681,7 +681,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) rpc_delay(task, HZ/100); /* 10 mili-seconds */ return 0; } - /* Fallthrough */ + fallthrough; default: /* * Unknown error or non-responding client, we'll need to fence. 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a527da3d8052..eaf50eafa935 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -428,7 +428,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; reclaim = true; - /* fall through */ + fallthrough; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 81ed8e8bab3f..c09a2a4281ec 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3117,7 +3117,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; default: /* checked by xdr code */ WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case SP4_SSV: status = nfserr_encr_alg_unsupp; goto out_nolock; @@ -4532,7 +4532,7 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, rpc_delay(task, 2 * HZ); return 0; } - /*FALLTHRU*/ + fallthrough; default: return 1; } @@ -4597,6 +4597,8 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) if (!i_am_nfsd()) return NULL; rqst = kthread_data(current); + if (!rqst->rq_lease_breaker) + return NULL; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; } @@ -5652,7 +5654,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) break; default: printk("unknown stateid type %x\n", s->sc_type); - /* Fallthrough */ + fallthrough; case NFS4_CLOSED_STID: case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; @@ -6742,7 +6744,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case NFS4_READW_LT: if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; - /* Fallthrough */ + fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); nf = find_readable_file_locked(fp); @@ -6754,7 +6756,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case NFS4_WRITEW_LT: if 
(nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; - /* Fallthrough */ + fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); nf = find_writeable_file_locked(fp); @@ -6816,7 +6818,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case FILE_LOCK_DEFERRED: nbl = NULL; - /* Fallthrough */ + fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 37bc8f5f4514..c81dbbad8792 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -459,7 +459,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_DEV: if (!old_valid_dev(exp_sb(exp)->s_dev)) return false; - /* FALL THROUGH */ + fallthrough; case FSID_MAJOR_MINOR: case FSID_ENCODE_DEV: return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; @@ -469,7 +469,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_UUID16: if (!is_root_export(exp)) return false; - /* fall through */ + fallthrough; case FSID_UUID4_INUM: case FSID_UUID16_INUM: return exp->ex_uuid != NULL; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 543bbe0a556e..6e0b066480c5 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -314,7 +314,7 @@ nfsd_proc_create(struct svc_rqst *rqstp) rdev = inode->i_rdev; attr->ia_valid |= ATTR_SIZE; - /* FALLTHROUGH */ + fallthrough; case S_IFIFO: /* this is probably a permission check.. 
* at least IRIX implements perm checking on diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b603dfcdd361..f7f6473578af 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -221,7 +221,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) case NFSD_TEST: if (nn->nfsd_versions) return nn->nfsd_versions[vers]; - /* Fallthrough */ + fallthrough; case NFSD_AVAIL: return nfsd_support_version(vers); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7d2933b85b65..aba5af9df328 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1456,7 +1456,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = true; break; } - /* fall through */ + fallthrough; case NFS4_CREATE_EXCLUSIVE4_1: if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime && d_inode(dchild)->i_atime.tv_sec == v_atime @@ -1465,7 +1465,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = true; goto set_attr; } - /* fall through */ + fallthrough; case NFS3_CREATE_GUARDED: err = nfserr_exist; } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index fb5a9a8a13cf..e516ae389ca5 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -519,7 +519,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) break; case NILFS_IFILE_INO: lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); - /* Fall through */ + fallthrough; default: bmap->b_ptr_type = NILFS_BMAP_PTR_VM; bmap->b_last_allocated_key = 0; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0b453ef8fae5..2217f904a7cf 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -626,7 +626,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; - /* Fall through */ + fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a651e821c2de..e3726aca28ed 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ 
-1138,7 +1138,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; @@ -1159,7 +1160,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.gc_inode_ptr = NULL; } - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, @@ -1186,7 +1188,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; - /* Fall through */ + fallthrough; case NILFS_ST_IFILE: err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, &nilfs_sc_file_ops); @@ -1197,13 +1199,14 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) err = nilfs_segctor_create_checkpoint(sci); if (unlikely(err)) break; - /* Fall through */ + fallthrough; case NILFS_ST_CPFILE: err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, &nilfs_sc_file_ops); if (unlikely(err)) break; - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_SUFILE: err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, &ndone); @@ -1219,7 +1222,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, @@ -1230,7 +1234,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - nilfs_sc_cstage_inc(sci); /* Fall through 
*/ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 559de311deca..3e01d8f2ab90 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1147,7 +1147,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { - case FAN_MARK_ADD: /* fallthrough */ + case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) return -EINVAL; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9bb9f0952b18..caf563981532 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi) brelse(bh); } + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { + ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", + le32_to_cpu(m->bytes_allocated), vol->mft_record_size); + goto err_out; + } + /* Apply the mst fixups. */ if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { /* FIXME: Try to use the $MFTMirr now. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4c1b90442d6f..78710788c237 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - /* Appending truncate log(TA) and and flushing truncate log(TF) are + /* Appending truncate log(TA) and flushing truncate log(TF) are * two separated transactions. They can be both committed but not * checkpointed. If crash occurs then, both two transaction will be * replayed with several already released to global bitmap clusters. 
@@ -7654,8 +7654,10 @@ out_mutex: * main_bm related locks for avoiding the current IO starve, then go to * trim the next group */ - if (ret >= 0 && group <= last_group) + if (ret >= 0 && group <= last_group) { + cond_resched(); goto next_group; + } out: range->len = trimmed * sb->s_blocksize; return ret; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 89d13e0705fe..0179a73a3fa2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1766,7 +1766,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, int sectsize; char *p = (char *)page; struct fd f; - struct inode *inode; ssize_t ret = -EINVAL; int live_threshold; @@ -1793,20 +1792,16 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, reg->hr_block_bytes == 0) goto out2; - inode = igrab(f.file->f_mapping->host); - if (inode == NULL) + if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - if (!S_ISBLK(inode->i_mode)) - goto out3; - - reg->hr_bdev = I_BDEV(f.file->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); - if (ret) { + reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, + FMODE_WRITE | FMODE_READ, NULL); + if (IS_ERR(reg->hr_bdev)) { + ret = PTR_ERR(reg->hr_bdev); reg->hr_bdev = NULL; - goto out3; + goto out2; } - inode = NULL; bdevname(reg->hr_bdev, reg->hr_dev_name); @@ -1909,16 +1904,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, config_item_name(®->hr_item), reg->hr_dev_name); out3: - iput(inode); + if (ret < 0) { + blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE); + reg->hr_bdev = NULL; + } out2: fdput(f); out: - if (ret < 0) { - if (reg->hr_bdev) { - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - reg->hr_bdev = NULL; - } - } return ret; } diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ef24574f481..cea739be77c4 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -67,7 +67,7 @@ static void 
o2quo_fence_self(void) default: WARN_ON(o2nm_single_cluster->cl_fence_method >= O2NM_FENCE_METHODS); - /* fall through */ + fallthrough; case O2NM_FENCE_RESET: printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " "system by restarting ***\n"); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 720e9f94957e..fc8252a28cb1 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * Under certain conditions, the window slide code * might have reduced the number of bits available or - * disabled the the local alloc entirely. Re-check + * disabled the local alloc entirely. Re-check * here and return -ENOSPC if necessary. */ status = -ENOSPC; diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee457143..0ac197658a2d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, } } -/* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct pipe_inode_info *pipe) -{ - DEFINE_WAIT(rdwait); - DEFINE_WAIT(wrwait); - - /* - * Pipes are system-local resources, so sleeping on them - * is considered a noninteractive wait: - */ - prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); - prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE); - pipe_unlock(pipe); - schedule(); - finish_wait(&pipe->rd_wait, &rdwait); - finish_wait(&pipe->wr_wait, &wrwait); - pipe_lock(pipe); -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -913,19 +894,18 @@ int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; + int error; if (!inode) return -ENFILE; if (flags & O_NOTIFICATION_PIPE) { -#ifdef CONFIG_WATCH_QUEUE - if (watch_queue_init(inode->i_pipe) < 0) { + error = watch_queue_init(inode->i_pipe); + if (error) { + free_pipe_info(inode->i_pipe); iput(inode); - return -ENOMEM; + return error; } 
-#else - return -ENOPKG; -#endif } f = alloc_file_pseudo(inode, pipe_mnt, "", @@ -1035,12 +1015,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return do_pipe2(fildes, 0); } +/* + * This is the stupid "wait for pipe to be readable or writable" + * model. + * + * See pipe_read/write() for the proper kind of exclusive wait, + * but that requires that we wake up any other readers/writers + * if we then do not end up reading everything (ie the whole + * "wake_next_reader/writer" logic in pipe_read/write()). + */ +void pipe_wait_readable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); + pipe_lock(pipe); +} + +void pipe_wait_writable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); + pipe_lock(pipe); +} + +/* + * This depends on both the wait (here) and the wakeup (wake_up_partner) + * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot + * race with the count check and waitqueue prep. + * + * Normally in order to avoid races, you'd do the prepare_to_wait() first, + * then check the condition you're waiting for, and only then sleep. But + * because of the pipe lock, we can check the condition before being on + * the wait queue. + * + * We use the 'rd_wait' waitqueue for pipe partner waiting. 
+ */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { + DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { - pipe_wait(pipe); + prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->rd_wait, &rdwait); + pipe_lock(pipe); if (signal_pending(current)) break; } @@ -1050,7 +1070,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); - wake_up_interruptible_all(&pipe->wr_wait); } static int fifo_open(struct inode *inode, struct file *filp) diff --git a/fs/proc/base.c b/fs/proc/base.c index 617db4e0faa0..aa69c35d904c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } diff --git a/fs/proc/page.c b/fs/proc/page.c index f909243d4a66..9f1077d94cde 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); +#ifdef CONFIG_64BIT + u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif return u; }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5066b0251ed8..846d43df3fdf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -520,16 +520,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, 
page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - page = find_get_entry(vma->vm_file->f_mapping, + page = xa_load(&vma->vm_file->f_mapping->i_pages, linear_page_index(vma, addr)); - if (!page) - return; - if (xa_is_value(page)) mss->swap += PAGE_SIZE; - else - put_page(page); - return; } @@ -653,6 +647,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", +#ifdef CONFIG_ARM64_MTE + [ilog2(VM_MTE)] = "mt", + [ilog2(VM_MTE_ALLOWED)] = "", +#endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", @@ -723,9 +721,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .pte_hole = smaps_pte_hole, }; +/* + * Gather mem stats from @vma with the indicated beginning + * address @start, and keep them in @mss. + * + * Use vm_start of @vma as the beginning address if @start is 0. 
+ */ static void smap_gather_stats(struct vm_area_struct *vma, - struct mem_size_stats *mss) + struct mem_size_stats *mss, unsigned long start) { + const struct mm_walk_ops *ops = &smaps_walk_ops; + + /* Invalid start */ + if (start >= vma->vm_end) + return; + #ifdef CONFIG_SHMEM /* In case of smaps_rollup, reset the value from previous vma */ mss->check_shmem_swap = false; @@ -742,18 +752,20 @@ static void smap_gather_stats(struct vm_area_struct *vma, */ unsigned long shmem_swapped = shmem_swap_usage(vma); - if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || - !(vma->vm_flags & VM_WRITE)) { + if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; - walk_page_vma(vma, &smaps_shmem_walk_ops, mss); - return; + ops = &smaps_shmem_walk_ops; } } #endif /* mmap_lock is held in m_start */ - walk_page_vma(vma, &smaps_walk_ops, mss); + if (!start) + walk_page_vma(vma, ops, mss); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ @@ -805,7 +817,7 @@ static int show_smap(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - smap_gather_stats(vma, &mss); + smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); @@ -853,9 +865,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v) hold_task_mempolicy(priv); - for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { - smap_gather_stats(vma, &mss); + for (vma = priv->mm->mmap; vma;) { + smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; + + /* + * Release mmap_lock temporarily if someone wants to + * access it for write request. + */ + if (mmap_lock_is_contended(mm)) { + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* + * After dropping the lock, there are four cases to + * consider. See the following example for explanation. 
+ * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * Suppose we drop the lock after reading VMA2 due to + * contention, then we get: + * + * last_vma_end = 16k + * + * 1) VMA2 is freed, but VMA3 exists: + * + * find_vma(mm, 16k - 1) will return VMA3. + * In this case, just continue from VMA3. + * + * 2) VMA2 still exists: + * + * find_vma(mm, 16k - 1) will return VMA2. + * Iterate the loop like the original one. + * + * 3) No more VMAs can be found: + * + * find_vma(mm, 16k - 1) will return NULL. + * No more things to do, just break. + * + * 4) (last_vma_end - 1) is the middle of a vma (VMA'): + * + * find_vma(mm, 16k - 1) will return VMA' whose range + * contains last_vma_end. + * Iterate VMA' from last_vma_end. + */ + vma = find_vma(mm, last_vma_end - 1); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 above */ + if (vma->vm_start >= last_vma_end) + continue; + + /* Case 4 above */ + if (vma->vm_end > last_vma_end) + smap_gather_stats(vma, &mss, last_vma_end); + } + /* Case 2 above */ + vma = vma->vm_next; } show_vma_header_prefix(m, priv->mm->mmap->vm_start, diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 819428dfa32f..3ce89216670c 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -1081,7 +1081,6 @@ next_zone: readop = psz_ftrace_read; break; case PSTORE_TYPE_CONSOLE: - fallthrough; case PSTORE_TYPE_PMSG: readop = psz_record_read; break; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index d1ceb76adb71..b59cd172b5f9 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -70,8 +70,3 @@ config QFMT_V2 config QUOTACTL bool default n - -config QUOTACTL_COMPAT - bool - depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT - default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index f2b49d0f0287..9160639daffa 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -4,5 +4,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o 
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o kqid.o -obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c deleted file mode 100644 index c30572857619..000000000000 --- a/fs/quota/compat.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/syscalls.h> -#include <linux/compat.h> -#include <linux/quotaops.h> - -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. - */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd, - const char __user *, special, qid_t, id, - void __user *, addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; - - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = kernel_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, 
sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = kernel_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = kernel_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = kernel_quotactl(cmd, special, id, addr); - } - return ret; -} diff --git a/fs/quota/compat.h b/fs/quota/compat.h new file mode 100644 index 000000000000..ef7d1e12d650 --- /dev/null +++ b/fs/quota/compat.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> + +struct compat_if_dqblk { + 
compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 5444d3c4d93f..6b37d58f1067 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -19,6 +19,7 @@ #include <linux/types.h> #include <linux/writeback.h> #include <linux/nospec.h> +#include "compat.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -38,7 +39,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) || (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id)))) break; - /*FALLTHROUGH*/ + fallthrough; default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -211,8 +212,18 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (ret) return ret; copy_to_if_dqblk(&idq, &fdq); - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; + + if (compat_need_64bit_alignment_fixup()) { + struct compat_if_dqblk __user *compat_dqblk = addr; + + if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk))) + return -EFAULT; + if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) + return -EFAULT; + } else { + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + } return 0; } @@ -277,8 +288,16 @@ static int 
quota_setquota(struct super_block *sb, int type, qid_t id, struct if_dqblk idq; struct kqid qid; - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; + if (compat_need_64bit_alignment_fixup()) { + struct compat_if_dqblk __user *compat_dqblk = addr; + + if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) + return -EFAULT; + } else { + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + } if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); @@ -382,6 +401,33 @@ static int quota_getstate(struct super_block *sb, int type, return 0; } +static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to, + struct fs_qfilestat *from) +{ + if (copy_to_user(to, from, sizeof(*to)) || + put_user(from->qfs_nextents, &to->qfs_nextents)) + return -EFAULT; + return 0; +} + +static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to, + struct fs_quota_stat *from) +{ + if (put_user(from->qs_version, &to->qs_version) || + put_user(from->qs_flags, &to->qs_flags) || + put_user(from->qs_pad, &to->qs_pad) || + compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) || + compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) || + put_user(from->qs_incoredqs, &to->qs_incoredqs) || + put_user(from->qs_btimelimit, &to->qs_btimelimit) || + put_user(from->qs_itimelimit, &to->qs_itimelimit) || + put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) || + put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) || + put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit)) + return -EFAULT; + return 0; +} + static int quota_getxstate(struct super_block *sb, int type, void __user *addr) { struct fs_quota_stat fqs; @@ -390,9 +436,14 @@ static int quota_getxstate(struct super_block *sb, int type, void __user *addr) if (!sb->s_qcop->get_state) return -ENOSYS; ret = quota_getstate(sb, type, &fqs); - if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + if 
(ret) + return ret; + + if (compat_need_64bit_alignment_fixup()) + return compat_copy_fs_quota_stat(addr, &fqs); + if (copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; - return ret; + return 0; } static int quota_getstatev(struct super_block *sb, int type, @@ -816,8 +867,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. */ -int kernel_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) { uint cmds, type; struct super_block *sb = NULL; @@ -871,9 +922,3 @@ out: path_put(pathp); return ret; } - -SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, - qid_t, id, void __user *, addr) -{ - return kernel_quotactl(cmd, special, id, addr); -} diff --git a/fs/read_write.c b/fs/read_write.c index 5db58b8c78d0..19f5c4bf75aa 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -538,6 +538,14 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } +/* + * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", + * but autofs is one of the few internal kernel users that actually + * wants this _and_ can be built as a module. So we need to export + * this symbol for autofs, even though it really isn't appropriate + * for any other kernel modules. + */ +EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) @@ -752,185 +760,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -/** - * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace - * into the kernel and check that it is valid. - * - * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE. 
- * @uvector: Pointer to the userspace array. - * @nr_segs: Number of elements in userspace array. - * @fast_segs: Number of elements in @fast_pointer. - * @fast_pointer: Pointer to (usually small on-stack) kernel array. - * @ret_pointer: (output parameter) Pointer to a variable that will point to - * either @fast_pointer, a newly allocated kernel array, or NULL, - * depending on which array was used. - * - * This function copies an array of &struct iovec of @nr_segs from - * userspace into the kernel and checks that each element is valid (e.g. - * it does not point to a kernel address or cause overflow by being too - * large, etc.). - * - * As an optimization, the caller may provide a pointer to a small - * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long - * (the size of this array, or 0 if unused, should be given in @fast_segs). - * - * @ret_pointer will always point to the array that was used, so the - * caller must take care not to call kfree() on it e.g. in case the - * @fast_pointer array was used and it was allocated on the stack. - * - * Return: The total number of bytes covered by the iovec array on success - * or a negative error code on error. - */ -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, - unsigned long nr_segs, unsigned long fast_segs, - struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - unsigned long seg; - ssize_t ret; - struct iovec *iov = fast_pointer; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... 
- */ - if (nr_segs == 0) { - ret = 0; - goto out; - } - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ - if (nr_segs > UIO_MAXIOV) { - ret = -EINVAL; - goto out; - } - if (nr_segs > fast_segs) { - iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) { - ret = -ENOMEM; - goto out; - } - } - if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { - ret = -EFAULT; - goto out; - } - - /* - * According to the Single Unix Specification we should return EINVAL - * if an element length is < 0 when cast to ssize_t or if the - * total length would overflow the ssize_t return value of the - * system call. - * - * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the - * overflow case. - */ - ret = 0; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; - - /* see if we we're about to use an invalid len or if - * it's about to overflow ssize_t */ - if (len < 0) { - ret = -EINVAL; - goto out; - } - if (type >= 0 - && unlikely(!access_ok(buf, len))) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - ret) { - len = MAX_RW_COUNT - ret; - iov[seg].iov_len = len; - } - ret += len; - } -out: - *ret_pointer = iov; - return ret; -} - -#ifdef CONFIG_COMPAT -ssize_t compat_rw_copy_check_uvector(int type, - const struct compat_iovec __user *uvector, unsigned long nr_segs, - unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - compat_ssize_t tot_len; - struct iovec *iov = *ret_pointer = fast_pointer; - ssize_t ret = 0; - int seg; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... 
- */ - if (nr_segs == 0) - goto out; - - ret = -EINVAL; - if (nr_segs > UIO_MAXIOV) - goto out; - if (nr_segs > fast_segs) { - ret = -ENOMEM; - iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) - goto out; - } - *ret_pointer = iov; - - ret = -EFAULT; - if (!access_ok(uvector, nr_segs*sizeof(*uvector))) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. - * - * In Linux, the total length is limited to MAX_RW_COUNT, there is - * no overflow possibility. - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - compat_uptr_t buf; - compat_ssize_t len; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting in compat_ssize_t .. */ - goto out; - if (type >= 0 && - !access_ok(compat_ptr(buf), len)) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - tot_len) - len = MAX_RW_COUNT - tot_len; - tot_len += len; - iov->iov_base = compat_ptr(buf); - iov->iov_len = (compat_size_t) len; - uvector++; - iov++; - } - ret = tot_len; - -out: - return ret; -} -#endif - static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { @@ -1247,224 +1076,93 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, return do_pwritev(fd, vec, vlen, pos, flags); } +/* + * Various compat syscalls. Note that they all pretend to take a native + * iovec - import_iovec will properly treat those as compat_iovecs based on + * in_compat_syscall(). 
+ */ #ifdef CONFIG_COMPAT -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - - ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } - if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); - return ret; -} - -static size_t do_compat_readv(compat_ulong_t fd, - const struct compat_iovec __user *vec, - compat_ulong_t vlen, rwf_t flags) -{ - struct fd f = fdget_pos(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - f.file->f_pos = pos; - fdput_pos(f); - return ret; - -} - -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, - compat_ulong_t, vlen) -{ - return do_compat_readv(fd, vec, vlen, 0); -} - -static long do_compat_preadv64(unsigned long fd, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, rwf_t flags) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos, flags); - fdput(f); - return ret; -} - #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { - return do_compat_preadv64(fd, vec, vlen, pos, 0); + return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - 
return do_compat_preadv64(fd, vec, vlen, pos, 0); + return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) - return do_compat_readv(fd, vec, vlen, flags); - - return do_compat_preadv64(fd, vec, vlen, pos, flags); + return do_readv(fd, vec, vlen, flags); + return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) - return do_compat_readv(fd, vec, vlen, flags); - - return do_compat_preadv64(fd, vec, vlen, pos, flags); -} - -static size_t compat_writev(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - - ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); - if (ret >= 0) { - file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } - if (ret > 0) - add_wchar(current, ret); - inc_syscw(current); - return ret; -} - -static size_t do_compat_writev(compat_ulong_t fd, - const struct compat_iovec __user* vec, - compat_ulong_t vlen, rwf_t flags) -{ - struct fd f = fdget_pos(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - f.file->f_pos = pos; - fdput_pos(f); - return ret; -} - -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, - const struct compat_iovec __user *, vec, - compat_ulong_t, vlen) -{ - return do_compat_writev(fd, vec, vlen, 0); -} - 
-static long do_compat_pwritev64(unsigned long fd, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, rwf_t flags) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos, flags); - fdput(f); - return ret; + return do_readv(fd, vec, vlen, flags); + return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { - return do_compat_pwritev64(fd, vec, vlen, pos, 0); + return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return do_compat_pwritev64(fd, vec, vlen, pos, 0); + return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) - return do_compat_writev(fd, vec, vlen, flags); - - return do_compat_pwritev64(fd, vec, vlen, pos, flags); + return do_writev(fd, vec, vlen, flags); + return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) - return do_compat_writev(fd, vec, vlen, flags); - - return do_compat_pwritev64(fd, vec, vlen, pos, flags); + return do_writev(fd, vec, vlen, flags); + return 
do_pwritev(fd, vec, vlen, pos, flags); } - -#endif +#endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 6b2b4362089e..b57b3ffcbc32 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -217,10 +217,8 @@ int romfs_dev_read(struct super_block *sb, unsigned long pos, size_t limit; limit = romfs_maxsize(sb); - if (pos >= limit) + if (pos >= limit || buflen > limit - pos) return -EIO; - if (buflen > limit - pos) - buflen = limit - pos; #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) diff --git a/fs/seq_file.c b/fs/seq_file.c index 4e6239f33c06..31219c1db17d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -295,7 +295,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_CUR: offset += file->f_pos; - /* fall through */ + fallthrough; case SEEK_SET: if (offset < 0) break; diff --git a/fs/signalfd.c b/fs/signalfd.c index 5b78719be445..456046e15873 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -176,7 +176,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info if (!nonblock) break; ret = -EAGAIN; - /* fall through */ + fallthrough; default: spin_unlock_irq(¤t->sighand->siglock); return ret; diff --git a/fs/splice.c b/fs/splice.c index d7c8a7c4db07..70cc52af780b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -33,7 +33,6 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> -#include <linux/compat.h> #include <linux/sched/signal.h> #include "internal.h" @@ -526,6 +525,22 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des return 1; } +/* We know we have a pipe buffer, but maybe it's empty? 
*/ +static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) +{ + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + + if (unlikely(!buf->len)) { + pipe_buf_release(pipe, buf); + pipe->tail = tail+1; + return true; + } + + return false; +} + /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from @@ -545,6 +560,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des if (signal_pending(current)) return -ERESTARTSYS; +repeat: while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; @@ -563,9 +579,12 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des sd->need_wakeup = false; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } + if (eat_empty_buffer(pipe)) + goto repeat; + return 1; } @@ -1077,7 +1096,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe_wait(pipe); + pipe_wait_writable(pipe); } } @@ -1332,20 +1351,6 @@ static int vmsplice_type(struct fd f, int *type) * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
* */ -static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags) -{ - if (unlikely(flags & ~SPLICE_F_ALL)) - return -EINVAL; - - if (!iov_iter_count(iter)) - return 0; - - if (iov_iter_rw(iter) == WRITE) - return vmsplice_to_pipe(f, iter, flags); - else - return vmsplice_to_user(f, iter, flags); -} - SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { @@ -1356,6 +1361,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct fd f; int type; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + f = fdget(fd); error = vmsplice_type(f, &type); if (error) @@ -1363,40 +1371,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); - if (error >= 0) { - error = do_vmsplice(f.file, &iter, flags); - kfree(iov); - } - fdput(f); - return error; -} - -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, - unsigned int, nr_segs, unsigned int, flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t error; - struct fd f; - int type; + if (error < 0) + goto out_fdput; - f = fdget(fd); - error = vmsplice_type(f, &type); - if (error) - return error; + if (!iov_iter_count(&iter)) + error = 0; + else if (iov_iter_rw(&iter) == WRITE) + error = vmsplice_to_pipe(f.file, &iter, flags); + else + error = vmsplice_to_user(f.file, &iter, flags); - error = compat_import_iovec(type, iov32, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); - if (error >= 0) { - error = do_vmsplice(f.file, &iter, flags); - kfree(iov); - } + kfree(iov); +out_fdput: fdput(f); return error; } -#endif SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, @@ -1454,7 +1443,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = 
-EAGAIN; break; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } pipe_unlock(pipe); @@ -1493,7 +1482,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe_wait(pipe); + pipe_wait_writable(pipe); } pipe_unlock(pipe); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 76bb1c846845..8a19773b5a0b 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,11 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - bio = bio_alloc(GFP_NOIO, page_count); + if (page_count <= BIO_MAX_PAGES) + bio = bio_alloc(GFP_NOIO, page_count); + else + bio = bio_kmalloc(GFP_NOIO, page_count); + if (!bio) return -ENOMEM; diff --git a/fs/super.c b/fs/super.c index 904459b35119..a51c2083cd6b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1256,6 +1256,8 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) + s->s_iflags |= SB_I_STABLE_WRITES; return 0; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9d042942d8b2..155521e51ac5 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -81,19 +81,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, struct ubifs_inode *ui; bool encrypted = false; - if (IS_ENCRYPTED(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) { - ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); - return ERR_PTR(err); - } - - if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-EPERM); - - encrypted = true; - } - inode = new_inode(c->vfs_sb); ui = ubifs_inode(inode); if (!inode) @@ -112,6 +99,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto 
out_iput; + } + switch (mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; @@ -131,7 +124,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, case S_IFBLK: case S_IFCHR: inode->i_op = &ubifs_file_inode_operations; - encrypted = false; break; default: BUG(); @@ -151,9 +143,8 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, if (c->highest_inum >= INUM_WATERMARK) { spin_unlock(&c->cnt_lock); ubifs_err(c, "out of inode numbers"); - make_bad_inode(inode); - iput(inode); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto out_iput; } ubifs_warn(c, "running out of inode numbers (current %lu, max %u)", (unsigned long)c->highest_inum, INUM_WATERMARK); @@ -171,16 +162,19 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, spin_unlock(&c->cnt_lock); if (encrypted) { - err = fscrypt_inherit_context(dir, inode, &encrypted, true); + err = fscrypt_set_context(inode, NULL); if (err) { - ubifs_err(c, "fscrypt_inherit_context failed: %i", err); - make_bad_inode(inode); - iput(inode); - return ERR_PTR(err); + ubifs_err(c, "fscrypt_set_context failed: %i", err); + goto out_iput; } } return inode; + +out_iput: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); } static int dbg_check_name(const struct ubifs_info *c, @@ -515,7 +509,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) if (err) return err; - err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); + err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr); if (err) return err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 22bfda158f7f..6d6cd85c2b4c 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -269,7 +269,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, break; /* No more room on heap so make it un-categorized */ cat = LPROPS_UNCAT; - /* Fall through */ + fallthrough; case LPROPS_UNCAT: list_add(&lprops->list, &c->uncat_list); 
break; @@ -313,7 +313,7 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, case LPROPS_FREEABLE: c->freeable_cnt -= 1; ubifs_assert(c, c->freeable_cnt >= 0); - /* Fall through */ + fallthrough; case LPROPS_UNCAT: case LPROPS_EMPTY: case LPROPS_FRDI_IDX: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a2420c900275..fbddb2a1c03f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2177,6 +2177,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) c->vi.vol_id); if (err) goto out_close; + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 6023c97c6da2..25ff91c7e94a 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -52,7 +52,7 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from, elen += pc->lengthComponentIdent; break; } - /* Fall through */ + fallthrough; case 2: if (tolen == 0) return -ENAMETOOLONG; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index e1f1b2e868a7..4931bec1a01c 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -42,7 +42,7 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); - /* Fall Through - to UFS_ST_SUN */ + fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); case UFS_ST_SUNx86: @@ -63,7 +63,7 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); break; } - /* Fall Through - to UFS_ST_SUN */ + fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); break; @@ -197,7 +197,7 @@ ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_suid == 0xFFFF) return fs32_to_cpu(sb, 
inode->ui_u3.ui_sun.ui_uid); - /* Fall through */ + fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_suid); } @@ -215,7 +215,7 @@ ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; - /* Fall through */ + fallthrough; default: inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); break; @@ -231,7 +231,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); - /* Fall through */ + fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); } @@ -249,7 +249,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; - /* Fall through */ + fallthrough; default: inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); break; diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 8fe03b4a0d2b..d7816c01a4f6 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -167,6 +167,8 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); if (err) goto fail_free; + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; /* Turn source into a shfl_string and map the folder */ size = strlen(fc->source) + 1; @@ -384,7 +386,7 @@ fail_nomem: static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) { - char *options = data; + unsigned char *options = data; if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 96bd160da48b..018057546067 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -226,7 +226,7 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat, break; case AT_STATX_FORCE_SYNC: 
sf_i->force_restat = 1; - /* fall-through */ + fallthrough; default: err = vboxsf_inode_revalidate(dentry); } diff --git a/fs/xattr.c b/fs/xattr.c index 386b45676d7e..cd7a563e8bcd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, } /** - * __vfs_setxattr_locked: set an extended attribute while holding the inode + * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - xattr name to set - * @value - value to set @name to - * @size - size of @value - * @flags - flags to pass into filesystem operations - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: xattr name to set + * @value: value to set @name to + * @size: size of @value + * @flags: flags to pass into filesystem operations + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int @@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name) EXPORT_SYMBOL(__vfs_removexattr); /** - * __vfs_removexattr_locked: set an extended attribute while holding the inode + * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - name of xattr to remove - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: name of xattr to remove + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. 
*/ int diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e841ed781a25..e986b95d94c9 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,25 +93,3 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; return __kmem_vmalloc(size, flags); } - -void * -kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_realloc(newsize, flags, _RET_IP_); - - do { - ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", - current->comm, current->pid, - newsize, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 8e8555817e6d..38007117697e 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -59,7 +59,6 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); @@ -72,12 +71,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return kmem_alloc(size, flags | KM_ZERO); } -static inline void * -kmem_zalloc_large(size_t size, xfs_km_flags_t flags) -{ - return kmem_alloc_large(size, flags | KM_ZERO); -} - /* * Zone interfaces */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 8cf73fe4338e..9331f3516afa 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -333,6 +333,11 @@ xfs_agiblock_init( } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(1); + if 
(xfs_sb_version_hasfinobt(&mp->m_sb)) + agi->agi_fblocks = cpu_to_be32(1); + } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2e055c079f39..fd8e6418a0d3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -428,7 +428,7 @@ xfs_attr_set( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); @@ -523,6 +523,14 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ +static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +{ + struct xfs_attr_shortform *sf; + + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + return be16_to_cpu(sf->hdr.totsize); +} + /* * Add a name to the shortform attribute list structure * This is the external routine. 
@@ -555,8 +563,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) return -ENOSPC; - newsize = XFS_ATTR_SF_TOTSIZE(args->dp); - newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + newsize = xfs_attr_sf_totsize(args->dp); + newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); if (!forkoff) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8623c815164a..bb128db220ac 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -653,8 +653,8 @@ xfs_attr_shortform_create( ASSERT(ifp->if_flags & XFS_IFINLINE); } xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; - hdr->count = 0; + hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -684,9 +684,9 @@ xfs_attr_sf_findname( sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) continue; @@ -728,15 +728,15 @@ xfs_attr_shortform_add( ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); offset = (char *)sfe - (char *)sf; - size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe 
= (xfs_attr_sf_entry_t *)((char *)sf + offset); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; @@ -787,12 +787,12 @@ xfs_attr_shortform_remove( dp = args->dp; mp = dp->i_mount; - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); if (error != -EEXIST) return error; - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); /* * Fix up the attribute fork data, covering the hole @@ -837,8 +837,8 @@ xfs_attr_shortform_remove( int xfs_attr_shortform_lookup(xfs_da_args_t *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; int i; struct xfs_ifork *ifp; @@ -846,10 +846,10 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return -EEXIST; @@ -873,10 +873,10 @@ xfs_attr_shortform_getvalue( int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return xfs_attr_copy_value(args, @@ -908,12 +908,12 @@ xfs_attr_shortform_to_leaf( dp = args->dp; ifp = dp->i_afp; - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform 
*)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (xfs_attr_shortform_t *)tmpbuffer; + sf = (struct xfs_attr_shortform *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -951,7 +951,7 @@ xfs_attr_shortform_to_leaf( ASSERT(error != -ENOSPC); if (error) goto out; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } error = 0; *leaf_bp = bp; @@ -992,9 +992,8 @@ xfs_attr_shortform_allfit( return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; - bytes += sizeof(struct xfs_attr_sf_entry) - 1 - + name_loc->namelen - + be16_to_cpu(name_loc->valuelen); + bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && @@ -1036,6 +1035,8 @@ xfs_attr_shortform_verify( * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are * within the data buffer. + * xfs_attr_sf_entry is defined with a 1-byte variable + * array at the end, so we must subtract that off. */ if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; @@ -1049,7 +1050,7 @@ xfs_attr_shortform_verify( * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bb004fb7944a..37578b369d9b 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -13,7 +13,6 @@ * to fit into the literal area of the inode. 
*/ typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; -typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; /* * We generate this then sort it, attr_list() must return things in hash-order. @@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort { unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; -#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ - (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) -#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ - ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) -#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ - ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) -#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ - (be16_to_cpu(((xfs_attr_shortform_t *) \ - ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +/* space name/value uses */ +static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen) +{ + return sizeof(struct xfs_attr_sf_entry) + nlen + vlen; +} + +/* space an entry uses */ +static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) +{ + return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); +} + +/* next entry in struct */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) +{ + return (void *)sfep + xfs_attr_sf_entsize(sfep); +} #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9c40d5971035..1b0a01b06a05 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6226,7 +6226,7 @@ xfs_bmap_validate_extent( isrt = XFS_IS_REALTIME_INODE(ip); endfsb = irec->br_startblock + irec->br_blockcount - 1; - if (isrt) { + if (isrt && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbno(mp, irec->br_startblock)) return __this_address; if 
(!xfs_verify_rtbno(mp, endfsb)) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 059ac108b1b3..b40a4e80f5ee 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -579,7 +579,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) /* * Entries are packed toward the top as tight as possible. */ -typedef struct xfs_attr_shortform { +struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ @@ -589,9 +589,9 @@ typedef struct xfs_attr_shortform { uint8_t namelen; /* actual length of name (no NULL) */ uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t nameval[]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ -} xfs_attr_shortform_t; +}; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5a2db00b9d5f..6766417d5ba4 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -69,6 +69,13 @@ xfs_dquot_verify( ddq_type != XFS_DQTYPE_GROUP) return __this_address; + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) + return __this_address; + if (id != -1 && id != be32_to_cpu(ddq->d_id)) return __this_address; @@ -288,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; + +/* Convert an on-disk timer value into an incore timer value. 
*/ +time64_t +xfs_dquot_from_disk_ts( + struct xfs_disk_dquot *ddq, + __be32 dtimer) +{ + uint32_t t = be32_to_cpu(dtimer); + + if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME)) + return xfs_dq_bigtime_to_unix(t); + + return t; +} + +/* Convert an incore timer value into an on-disk timer value. */ +__be32 +xfs_dquot_to_disk_ts( + struct xfs_dquot *dqp, + time64_t timer) +{ + uint32_t t = timer; + + if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME)) + t = xfs_dq_unix_to_bigtime(timer); + + return cpu_to_be32(t); +} diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 31b7ece985bb..dd764da08f6f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -449,10 +449,12 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ +#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ - XFS_SB_FEAT_RO_COMPAT_REFLINK) + XFS_SB_FEAT_RO_COMPAT_REFLINK| \ + XFS_SB_FEAT_RO_COMPAT_INOBTCNT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -465,10 +467,12 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID) + XFS_SB_FEAT_INCOMPAT_META_UUID| \ + XFS_SB_FEAT_INCOMPAT_BIGTIME) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -563,6 +567,23 @@ static inline 
bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); } +static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME); +} + +/* + * Inode btree block counter. We record the number of inobt and finobt blocks + * in the AGI header so that we can skip the finobt walk at mount time when + * setting up per-AG reservations. + */ +static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT); +} + /* * end of superblock version macros */ @@ -765,6 +786,9 @@ typedef struct xfs_agi { __be32 agi_free_root; /* root of the free inode btree */ __be32 agi_free_level;/* levels in free inode btree */ + __be32 agi_iblocks; /* inobt blocks used */ + __be32 agi_fblocks; /* finobt blocks used */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; @@ -785,7 +809,8 @@ typedef struct xfs_agi { #define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) #define XFS_AGI_FREE_ROOT (1 << 11) #define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 +#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) @@ -831,10 +856,87 @@ struct xfs_agfl { ASSERT(xfs_daddr_to_agno(mp, d) == \ xfs_daddr_to_agno(mp, (d) + (len) - 1))) -typedef struct xfs_timestamp { +/* + * XFS Timestamps + * ============== + * + * Traditional ondisk inode timestamps consist of signed 32-bit counters for + * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC + * 1970, which means that the timestamp epoch is the same as the Unix epoch. 
+ * Therefore, the ondisk min and max defined here can be used directly to + * constrain the incore timestamps on a Unix system. Note that we actually + * encode a __be64 value on disk. + * + * When the bigtime feature is enabled, ondisk inode timestamps become an + * unsigned 64-bit nanoseconds counter. This means that the bigtime inode + * timestamp epoch is the start of the classic timestamp range, which is + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers + * /must/ use the bigtime conversion functions when encoding and decoding raw + * timestamps. + */ +typedef __be64 xfs_timestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; +}; + +/* + * Smallest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901. + */ +#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) + +/* + * Largest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038. + */ +#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) + +/* + * Smallest possible ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with the traditional + * minimum timestamp of Dec 13 20:45:52 UTC 1901. + */ +#define XFS_BIGTIME_TIME_MIN ((int64_t)0) + +/* + * Largest supported ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with an incore timestamp + * of Jul 2 20:20:24 UTC 2486. + * + * We round down the ondisk limit so that the bigtime quota and inode max + * timestamps will be the same. 
+ */ +#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL)) + +/* + * Bigtime epoch is set exactly to the minimum time value that a traditional + * 32-bit timestamp can represent when using the Unix epoch as a reference. + * Hence the Unix epoch is at a fixed offset into the supported bigtime + * timestamp range. + * + * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS + * timestamp can represent so we will not lose any fidelity in converting + * to/from unix and bigtime timestamps. + * + * The following conversion factor converts a seconds counter from the Unix + * epoch to the bigtime epoch. + */ +#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN) + +/* Convert a timestamp from the Unix epoch to the bigtime epoch. */ +static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds) +{ + return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET; +} + +/* Convert a timestamp from the bigtime epoch to the Unix epoch. */ +static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; +} /* * On-disk inode structure. 
@@ -1061,12 +1163,22 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ + #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_ANY \ - (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ + XFS_DIFLAG2_BIGTIME) + +static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); +} /* * Inode number format: @@ -1152,13 +1264,98 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQTYPE_USER 0x01 /* user dquot record */ #define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ #define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ +#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ XFS_DQTYPE_PROJ | \ XFS_DQTYPE_GROUP) -#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \ + XFS_DQTYPE_BIGTIME) + +/* + * XFS Quota Timers + * ================ + * + * Traditional quota grace period expiration timers are an unsigned 32-bit + * seconds counter; time zero is the Unix epoch, Jan 1 00:00:00 UTC 1970. + * Note that an expiration value of zero means that the quota limit has not + * been reached, and therefore no expiration has been set. 
Therefore, the + * ondisk min and max defined here can be used directly to constrain the incore + * quota expiration timestamps on a Unix system. + * + * When bigtime is enabled, we trade two bits of precision to expand the + * expiration timeout range to match that of big inode timestamps. The min and + * max recorded here are the on-disk limits, not a Unix timestamp. + * + * The grace period for each quota type is stored in the root dquot (id = 0) + * and is applied to a non-root dquot when it exceeds the soft or hard limits. + * The length of quota grace periods are unsigned 32-bit quantities measured in + * units of seconds. A value of zero means to use the default period. + */ + +/* + * Smallest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970. + */ +#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1) + +/* + * Largest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106. + */ +#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX) + +/* + * Smallest possible ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with the incore + * expiration of Jan 1 00:00:04 UTC 1970. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN) + +/* + * Largest supported ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with an incore + * expiration of Jul 2 20:20:24 UTC 2486. + * + * The ondisk field supports values up to -1U, which corresponds to an incore + * expiration in 2514. This is beyond the maximum the bigtime inode timestamp, + * so we cap the maximum bigtime quota expiration to the max inode timestamp. 
+ */ +#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U) + +/* + * The following conversion factors assist in converting a quota expiration + * timestamp between the incore and ondisk formats. + */ +#define XFS_DQ_BIGTIME_SHIFT (2) +#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1) + +/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */ +static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds) +{ + /* + * Round the expiration timestamp up to the nearest bigtime timestamp + * that we can store, to give users the most time to fix problems. + */ + return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >> + XFS_DQ_BIGTIME_SHIFT; +} + +/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */ +static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT; +} + +/* + * Default quota grace periods, ranging from zero (use the compiled defaults) + * to ~136 years. These are applied to a non-root dquot that has exceeded + * either limit. + */ +#define XFS_DQ_GRACE_MIN ((int64_t)0) +#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) /* * This is the main portion of the on-disk representation of quota information diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 84bcffa87753..2a2e3cfd94f0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -249,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ +#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ /* * Minimum and maximum sizes need for growth checks. 
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f742a96a2fe1..974e71bc4a3a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -688,7 +688,7 @@ xfs_ialloc_ag_alloc( args.minalignslop = igeo->cluster_align - 1; /* Allow space for the inode btree to split. */ - args.minleft = igeo->inobt_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels; if ((error = xfs_alloc_vextent(&args))) return error; @@ -736,7 +736,7 @@ xfs_ialloc_ag_alloc( /* * Allow space for the inode btree to split. */ - args.minleft = igeo->inobt_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2473,6 +2473,7 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), + offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -2806,6 +2807,10 @@ xfs_ialloc_setup_geometry( uint64_t icount; uint inodes; + igeo->new_diflags2 = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 3c8aebc36e64..cc919a2ee870 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -67,6 +67,25 @@ xfs_finobt_set_root( XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); } +/* Update the inode btree block counter for this btree. 
*/ +static inline void +xfs_inobt_mod_blockcount( + struct xfs_btree_cur *cur, + int howmuch) +{ + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; + + if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) + return; + + if (cur->bc_btnum == XFS_BTNUM_FINO) + be32_add_cpu(&agi->agi_fblocks, howmuch); + else if (cur->bc_btnum == XFS_BTNUM_INO) + be32_add_cpu(&agi->agi_iblocks, howmuch); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); +} + STATIC int __xfs_inobt_alloc_block( struct xfs_btree_cur *cur, @@ -102,6 +121,7 @@ __xfs_inobt_alloc_block( new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; + xfs_inobt_mod_blockcount(cur, 1); return 0; } @@ -134,6 +154,7 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); @@ -480,19 +501,29 @@ xfs_inobt_commit_staged_btree( { struct xfs_agi *agi = agbp->b_addr; struct xbtree_afakeroot *afake = cur->bc_ag.afake; + int fields; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); if (cur->bc_btnum == XFS_BTNUM_INO) { + fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); } else { + fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | - XFS_AGI_FREE_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_fblocks = cpu_to_be32(afake->af_blocks); + fields |= 
XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); } } @@ -673,6 +704,28 @@ xfs_inobt_count_blocks( return error; } +/* Read finobt block count from AGI header. */ +static int +xfs_finobt_read_blocks( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_agi *agi; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + return error; + + agi = agbp->b_addr; + *tree_blocks = be32_to_cpu(agi->agi_fblocks); + xfs_trans_brelse(tp, agbp); + return 0; +} + /* * Figure out how many blocks to reserve and how many are used by this btree. */ @@ -690,7 +743,11 @@ xfs_finobt_calc_reserves( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return 0; - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len); + else + error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, + &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 52451809c478..b4164256993d 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -603,7 +603,7 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_u1.if_root = new; cur->leaf = new; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8d5dd08eab75..c667c63f2cb0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -157,6 +157,36 @@ xfs_imap_to_bp( return 0; } +static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) +{ + struct timespec64 tv; + 
uint32_t n; + + tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); + tv.tv_nsec = n; + + return tv; +} + +/* Convert an ondisk timestamp to an incore timestamp. */ +struct timespec64 +xfs_inode_from_disk_ts( + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + struct xfs_legacy_timestamp *lts; + + if (xfs_dinode_has_bigtime(dip)) + return xfs_inode_decode_bigtime(be64_to_cpu(ts)); + + lts = (struct xfs_legacy_timestamp *)&ts; + tv.tv_sec = (int)be32_to_cpu(lts->t_sec); + tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); + + return tv; +} + int xfs_inode_from_disk( struct xfs_inode *ip, @@ -211,12 +241,9 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); - inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); - inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); - inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); - inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); - inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); + inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); @@ -229,8 +256,7 @@ xfs_inode_from_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } @@ -252,6 +278,25 @@ out_destroy_data_fork: return error; } +/* Convert an incore timestamp to 
an ondisk timestamp. */ +static inline xfs_timestamp_t +xfs_inode_to_disk_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_timestamp *lts; + xfs_timestamp_t ts; + + if (xfs_inode_has_bigtime(ip)) + return cpu_to_be64(xfs_inode_encode_bigtime(tv)); + + lts = (struct xfs_legacy_timestamp *)&ts; + lts->t_sec = cpu_to_be32(tv.tv_sec); + lts->t_nsec = cpu_to_be32(tv.tv_nsec); + + return ts; +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -271,12 +316,9 @@ xfs_inode_to_disk( to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); - to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); - to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); - to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); - to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); @@ -295,8 +337,7 @@ xfs_inode_to_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); + to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); @@ -310,58 +351,6 @@ xfs_inode_to_disk( } } -void -xfs_log_dinode_to_disk( - struct xfs_log_dinode *from, - struct xfs_dinode *to) -{ - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = 
cpu_to_be16(from->di_mode); - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_onlink = 0; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = cpu_to_be32(from->di_cowextsize); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); - } -} - static xfs_failaddr_t xfs_dinode_verify_fork( struct xfs_dinode *dip, @@ -568,6 +557,11 @@ xfs_dinode_verify( 
if (fa) return fa; + /* bigtime iflag can only happen on bigtime filesystems */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + return NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 6b08b9d060c2..536666143fe7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -32,6 +32,11 @@ struct xfs_icdinode { struct timespec64 di_crtime; /* time created */ }; +static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd) +{ + return icd->di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Inode location information. Stored in the inode and passed to * xfs_imap_to_bp() to get a buffer and dinode for a given inode. @@ -49,8 +54,6 @@ void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); -void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, - struct xfs_dinode *to); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); @@ -60,4 +63,12 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) +{ + return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec; +} + +struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, + const xfs_timestamp_t ts); + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0cf853d42d62..7575de5cecb1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -386,8 +386,8 @@ xfs_iroot_realloc( cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = 
kmem_realloc(ifp->if_broot, new_size, - KM_NOFS); + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_NOFS | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -496,8 +496,8 @@ xfs_idata_realloc( * in size so that it can be logged and stay on word boundaries. * We enforce that here. */ - ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_NOFS); + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e3400c9c71cd..8bd00da6d2a4 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -368,10 +368,13 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef struct xfs_ictimestamp { +typedef uint64_t xfs_ictimestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_ictimestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; +}; /* * Define the format of the inode core that is logged. 
This structure must be diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 641132d0e39d..3cca2bfe714c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -121,7 +121,6 @@ struct xlog_recover { void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops); bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); -void xlog_recover_iodone(struct xfs_buf *bp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 076bdc7037ee..0f0af4e35032 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -23,7 +23,8 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQTYPE_STRINGS \ { XFS_DQTYPE_USER, "USER" }, \ { XFS_DQTYPE_PROJ, "PROJ" }, \ - { XFS_DQTYPE_GROUP, "GROUP" } + { XFS_DQTYPE_GROUP, "GROUP" }, \ + { XFS_DQTYPE_BIGTIME, "BIGTIME" } /* * flags for q_flags field in the dquot. 
@@ -143,4 +144,9 @@ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, xfs_dqtype_t type); +struct xfs_dquot; +time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, + __be32 dtimer); +__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ae9aaf1f34bf..5aeafa59ed27 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -954,7 +954,7 @@ xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *bp = xfs_trans_getsb(tp, mp); + struct xfs_buf *bp = xfs_trans_getsb(tp); mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); @@ -1084,7 +1084,7 @@ xfs_sync_sb_buf( if (error) return error; - bp = xfs_trans_getsb(tp, mp); + bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); @@ -1166,6 +1166,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_sb_version_hasreflink(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; + if (xfs_sb_version_hasbigtime(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_sb_version_hassector(sbp)) geo->logsectsize = sbp->sb_logsectsize; else diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 708feb8eac76..c795ae47b3c9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -176,6 +176,9 @@ struct xfs_ino_geometry { unsigned int ialloc_align; unsigned int agino_log; /* #bits for agino in inum */ + + /* precomputed value for di_flags2 */ + uint64_t new_diflags2; }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index e15129647e00..90f1d5645052 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -110,9 +110,9 @@ xfs_trans_log_inode( * to log 
the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); } @@ -132,6 +132,17 @@ xfs_trans_log_inode( } /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && + !xfs_inode_has_bigtime(ip)) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* * Record the specific change for fdatasync optimisation. This allows * fdatasync to skip log forces for inodes that are only timestamp * dirty. @@ -177,9 +188,9 @@ xfs_trans_log_inode( /* * Always OR in the bits from the ili_last_fields field. This is to - * coordinate with the xfs_iflush() and xfs_iflush_done() routines in - * the eventual clearing of the ili_fields bits. See the big comment in - * xfs_iflush() for an explanation of this coordination mechanism. + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. */ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); spin_unlock(&iip->ili_lock); diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index c6df01a2a158..7ad3659c5d2a 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -58,7 +58,7 @@ #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ - (M_IGEO(mp)->inobt_maxlevels - 1))) + M_IGEO(mp)->inobt_maxlevels)) /* * Space reservation values for various transactions. 
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e9bcf1faa183..ae8e2e0ac64a 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -781,6 +781,35 @@ xchk_agi_xref_icounts( xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); } +/* Check agi_[fi]blocks against tree size */ +static inline void +xchk_agi_xref_fiblocks( + struct xfs_scrub *sc) +{ + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agblock_t blocks; + int error = 0; + + if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb)) + return; + + if (sc->sa.ino_cur) { + error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_iblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_fblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } +} + /* Cross-reference with the other btrees. 
*/ STATIC void xchk_agi_xref( @@ -804,6 +833,7 @@ xchk_agi_xref( xchk_agi_xref_icounts(sc); xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bca2ab1d4be9..401f71579ce6 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -810,10 +810,34 @@ xrep_agi_calc_from_btrees( error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + agi->agi_iblocks = cpu_to_be32(blocks); + } xfs_btree_del_cursor(cur, error); agi->agi_count = cpu_to_be32(count); agi->agi_freecount = cpu_to_be32(freecount); + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, + XFS_BTNUM_FINO); + if (error) + goto err; + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agi->agi_fblocks = cpu_to_be32(blocks); + } + return 0; err: xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6d483ab29e63..3aa85b64de36 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -190,11 +190,30 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK)) goto bad; + /* no bigtime iflag without the bigtime feature */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); } +static inline void +xchk_dinode_nsec( + struct xfs_scrub *sc, + xfs_ino_t ino, + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + + tv = xfs_inode_from_disk_ts(dip, ts); + 
if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) + xchk_ino_set_corrupt(sc, ino); +} + /* Scrub all the ondisk inode fields. */ STATIC void xchk_dinode( @@ -293,12 +312,9 @@ xchk_dinode( } /* di_[amc]time.nsec */ - if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_atime); + xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); + xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); /* * di_size. xfs_dinode_verify checks for things that screw up @@ -403,8 +419,7 @@ xchk_dinode( } if (dip->di_version >= 3) { - if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); xchk_inode_cowextsize(sc, dip, ino, mode, flags, flags2); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 5641ae512c9e..c08be5ede066 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. 
*/ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index d4c687b5cd06..c544951a0c07 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -192,7 +192,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl) { args.valuelen = XFS_ACL_SIZE(acl->a_count); - args.value = kmem_zalloc_large(args.valuelen, 0); + args.value = kvzalloc(args.valuelen, GFP_KERNEL); if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b35611882ff9..55d126d4e096 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -544,7 +544,7 @@ xfs_discard_page( page, ip->i_ino, offset); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - PAGE_SIZE / i_blocksize(inode)); + i_blocks_per_page(inode, page)); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 50f922cad91a..8f8837fe21cf 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,7 +61,7 @@ xfs_attr_shortform_list( int error = 0; ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -96,7 +96,7 @@ xfs_attr_shortform_list( */ if (context->seen_enough) break; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } trace_xfs_attr_list_sf_all(context); return 0; @@ -136,7 +136,7 @@ xfs_attr_shortform_list( /* These are bytes, and both on-disk, don't endian-flip */ sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); sbp++; nsbuf++; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 
73cafc843cd7..f2a8a0e75e1f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -946,6 +946,14 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + /* We can only free complete realtime extents. */ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) + return -EINVAL; + } + /* * Need to zero the stuff we're not freeing, on disk. */ @@ -1139,6 +1147,14 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + /* We can only insert complete realtime extents. */ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((stop_fsb | shift_fsb) & (extsz - 1)) + return -EINVAL; + } + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; @@ -1165,7 +1181,7 @@ xfs_insert_file_space( goto out_trans_cancel; do { - error = xfs_trans_roll_inode(&tp, ip); + error = xfs_defer_finish(&tp); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d4cdcb6fb2fe..4e4cf91f4f9f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -52,6 +52,15 @@ static kmem_zone_t *xfs_buf_zone; * b_lock (trylock due to inversion) */ +static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); + +static inline int +xfs_buf_submit( + struct xfs_buf *bp) +{ + return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -751,7 +760,7 @@ found: return 0; } -STATIC int +int _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) @@ -759,7 +768,7 @@ _xfs_buf_read( ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); return xfs_buf_submit(bp); @@ -1170,20 +1179,145 @@ 
xfs_buf_wait_unpin( set_current_state(TASK_RUNNING); } +static void +xfs_buf_ioerror_alert_ratelimited( + struct xfs_buf *bp) +{ + static unsigned long lasttime; + static struct xfs_buftarg *lasttarg; + + if (bp->b_target != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_buf_ioerror_alert(bp, __this_address); + } + lasttarg = bp->b_target; +} + /* - * Buffer Utility Routines + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; -void + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + return true; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + return true; + + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + return true; + + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. + * + * Returns true if this function took care of error handling and the caller must + * not touch the buffer again. Return false if the caller should proceed with + * normal I/O completion handling. 
+ */ +static bool +xfs_buf_ioend_handle_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + /* + * If we've already decided to shutdown the filesystem because of I/O + * errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; + + xfs_buf_ioerror_alert_ratelimited(bp); + + /* + * We're not going to bother about retrying this during recovery. + * One strike! + */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; + } + + /* + * Synchronous writes will have callers process the error. + */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_iodone_async(bp, _RET_IP_); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + if (bp->b_last_error != bp->b_error || + !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + goto resubmit; + } + + /* + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. + */ + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. 
*/ + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_io_fail(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_io_fail(bp); + else + ASSERT(list_empty(&bp->b_li_list)); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; + +resubmit: + xfs_buf_ioerror(bp, 0); + bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + xfs_buf_submit(bp); + return true; +out_stale: + xfs_buf_stale(bp); + bp->b_flags |= XBF_DONE; + bp->b_flags &= ~XBF_WRITE; + trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +static void xfs_buf_ioend( struct xfs_buf *bp) { - bool read = bp->b_flags & XBF_READ; - trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* * Pull in IO completion errors now. We are guaranteed to be running * single threaded, so we don't need the lock to read b_io_error. @@ -1191,39 +1325,47 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - if (read) { + if (bp->b_flags & XBF_READ) { if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; - xfs_buf_ioend_finish(bp); - return; - } + } else { + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; + bp->b_flags |= XBF_DONE; + } - if (!bp->b_error) { - bp->b_flags &= ~XBF_WRITE_FAIL; - bp->b_flags |= XBF_DONE; - } + if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) + return; - /* - * If this is a log recovery buffer, we aren't doing transactional IO - * yet so we need to let it handle IO completions. - */ - if (bp->b_flags & _XBF_LOGRECOVERY) { - xlog_recover_iodone(bp); - return; - } + /* clear the retry state */ + bp->b_last_error = 0; + bp->b_retries = 0; + bp->b_first_retry_time = 0; - if (bp->b_flags & _XBF_INODES) { - xfs_buf_inode_iodone(bp); - return; - } + /* + * Note that for things like remote attribute buffers, there may + * not be a buffer log item here, so processing the buffer log + * item must remain optional. 
+ */ + if (bp->b_log_item) + xfs_buf_item_done(bp); + + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_iodone(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_iodone(bp); - if (bp->b_flags & _XBF_DQUOTS) { - xfs_buf_dquot_iodone(bp); - return; } - xfs_buf_iodone(bp); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | + _XBF_LOGRECOVERY); + + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); } static void @@ -1506,7 +1648,7 @@ xfs_buf_iowait( * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. */ -int +static int __xfs_buf_submit( struct xfs_buf *bp, bool wait) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 755b652e695a..bfd2907e7bc4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -249,6 +249,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); +int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -269,28 +270,12 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(struct xfs_buf *bp); -static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); -} extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); - -extern int __xfs_buf_submit(struct xfs_buf *bp, bool); -static inline int xfs_buf_submit(struct xfs_buf *bp) -{ - bool wait = bp->b_flags & XBF_ASYNC ? 
false : true; - return __xfs_buf_submit(bp, wait); -} - void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 408d1b572d3f..0356f2e340a1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -30,8 +30,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -static void xfs_buf_item_done(struct xfs_buf *bp); - /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( @@ -463,7 +461,7 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_item_done(bp); - xfs_iflush_done(bp); + xfs_buf_inode_iodone(bp); ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); @@ -956,153 +954,10 @@ xfs_buf_item_relse( xfs_buf_item_free(bip); } -/* - * Decide if we're going to retry the write after a failure, and prepare - * the buffer for retrying the write. - */ -static bool -xfs_buf_ioerror_fail_without_retry( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - - /* - * If we've already decided to shutdown the filesystem because of - * I/O errors, there's no point in giving this a retry. 
- */ - if (XFS_FORCED_SHUTDOWN(mp)) - return true; - - if (bp->b_target != lasttarg || - time_after(jiffies, (lasttime + 5*HZ))) { - lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __this_address); - } - lasttarg = bp->b_target; - - /* synchronous writes will have callers process the error */ - if (!(bp->b_flags & XBF_ASYNC)) - return true; - return false; -} - -static bool -xfs_buf_ioerror_retry( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && - bp->b_last_error == bp->b_error) - return false; - - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; - return true; -} - -/* - * Account for this latest trip around the retry handler, and decide if - * we've failed enough times to constitute a permanent failure. - */ -static bool -xfs_buf_ioerror_permanent( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - struct xfs_mount *mp = bp->b_mount; - - if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && - ++bp->b_retries > cfg->max_retries) - return true; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - return true; - - /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - return true; - - return false; -} - -/* - * On a sync write or shutdown we just want to stale the buffer and let the - * caller handle the error in bp->b_error appropriately. - * - * If the write was asynchronous then no one will be looking for the error. If - * this is the first failure of this type, clear the error state and write the - * buffer out again. This means we always retry an async write failure at least - * once, but we also need to set the buffer up to behave correctly now for - * repeated failures. 
- * - * If we get repeated async write failures, then we take action according to the - * error configuration we have been set up to use. - * - * Multi-state return value: - * - * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions - * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions - * XBF_IOERROR_FAIL: transient error, run failure callback completions and then - * release the buffer - */ -enum { - XBF_IOERROR_FINISH, - XBF_IOERROR_DONE, - XBF_IOERROR_FAIL, -}; - -static int -xfs_buf_iodone_error( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_error_cfg *cfg; - - if (xfs_buf_ioerror_fail_without_retry(bp)) - goto out_stale; - - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - if (xfs_buf_ioerror_retry(bp, cfg)) { - xfs_buf_ioerror(bp, 0); - xfs_buf_submit(bp); - return XBF_IOERROR_DONE; - } - - /* - * Permanent error - we need to trigger a shutdown if we haven't already - * to indicate that inconsistency will result from this action. - */ - if (xfs_buf_ioerror_permanent(bp, cfg)) { - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - goto out_stale; - } - - /* Still considered a transient error. Caller will schedule retries. */ - return XBF_IOERROR_FAIL; - -out_stale: - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); - return XBF_IOERROR_FINISH; -} - -static void +void xfs_buf_item_done( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - - if (!bip) - return; - /* * If we are forcibly shutting down, this may well be off the AIL * already. That's because we simulate the log-committed callbacks to @@ -1111,113 +966,12 @@ xfs_buf_item_done( * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. 
+ * + * Note that log recovery writes might have buffer items that are not on + * the AIL even when the file system is not shut down. */ - xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); - bp->b_log_item = NULL; - xfs_buf_item_free(bip); - xfs_buf_rele(bp); -} - -static inline void -xfs_buf_clear_ioerror_retry_state( - struct xfs_buf *bp) -{ - bp->b_last_error = 0; - bp->b_retries = 0; - bp->b_first_retry_time = 0; -} - -/* - * Inode buffer iodone callback function. - */ -void -xfs_buf_inode_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - set_bit(XFS_LI_FAILED, &lip->li_flags); - } - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_iflush_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dquot buffer iodone callback function. - */ -void -xfs_buf_dquot_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - spin_lock(&bp->b_mount->m_ail->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - xfs_set_li_failed(lip, bp); - } - spin_unlock(&bp->b_mount->m_ail->ail_lock); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - /* a newly allocated dquot buffer might have a log item attached */ - xfs_buf_item_done(bp); - xfs_dquot_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dirty buffer iodone callback function. 
- * - * Note that for things like remote attribute buffers, there may not be a buffer - * log item here, so processing the buffer log item must remain be optional. - */ -void -xfs_buf_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - ASSERT(list_empty(&bp->b_li_list)); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_buf_ioend_finish(bp); + xfs_trans_ail_delete(&bp->b_log_item->bli_item, + (bp->b_flags & _XBF_LOGRECOVERY) ? 0 : + SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_relse(bp); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 23507cbb4c41..50aa0f5ef959 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -50,12 +50,24 @@ struct xfs_buf_log_item { }; int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_done(struct xfs_buf *bp); void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_inode_io_fail(struct xfs_buf *bp); +#ifdef CONFIG_XFS_QUOTA void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_dquot_io_fail(struct xfs_buf *bp); +#else +static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) +{ +} +static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) +{ +} +#endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 8f0457d67d77..24c7a8d11e1a 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -414,7 +414,7 @@ 
xlog_recover_validate_buf_type( * * Write verifiers update the metadata LSN from log items attached to * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. + * the verifier. */ if (bp->b_ops) { struct xfs_buf_log_item *bip; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bcd73b9c2994..3072814e407d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -98,12 +98,33 @@ xfs_qm_adjust_dqlimits( xfs_dquot_set_prealloc_limits(dq); } +/* Set the expiration time of a quota's grace period. */ +time64_t +xfs_dquot_set_timeout( + struct xfs_mount *mp, + time64_t timeout) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + + return clamp_t(time64_t, timeout, qi->qi_expiry_min, + qi->qi_expiry_max); +} + +/* Set the length of the default grace period. */ +time64_t +xfs_dquot_set_grace_period( + time64_t grace) +{ + return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX); +} + /* * Determine if this quota counter is over either limit and set the quota * timers as appropriate. 
*/ static inline void xfs_qm_adjust_res_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim) { @@ -112,7 +133,8 @@ xfs_qm_adjust_res_timer( if ((res->softlimit && res->count > res->softlimit) || (res->hardlimit && res->count > res->hardlimit)) { if (res->timer == 0) - res->timer = ktime_get_real_seconds() + qlim->time; + res->timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + qlim->time); } else { if (res->timer == 0) res->warnings = 0; @@ -145,9 +167,9 @@ xfs_qm_adjust_dqtimers( ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); - xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); - xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); - xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb); } /* @@ -201,6 +223,8 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; + if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb)) + d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -514,9 +538,9 @@ xfs_dquot_from_disk( dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); - dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); - dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); + dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); + dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); + dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage @@ -559,9 +583,9 @@ xfs_dquot_to_disk( ddqp->d_iwarns = 
cpu_to_be16(dqp->q_ino.warnings); ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); - ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); - ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); - ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); + ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); + ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); + ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -1107,7 +1131,7 @@ xfs_qm_dqflush_done( } void -xfs_dquot_done( +xfs_buf_dquot_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -1118,6 +1142,18 @@ xfs_dquot_done( } } +void +xfs_buf_dquot_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + xfs_set_li_failed(lip, bp); + spin_unlock(&bp->b_mount->m_ail->ail_lock); +} + /* Check incore dquot for errors before we flush. 
*/ static xfs_failaddr_t xfs_qm_dqflush_check( @@ -1145,6 +1181,14 @@ xfs_qm_dqflush_check( !dqp->q_rtb.timer) return __this_address; + /* bigtime flag should never be set on root dquots */ + if (dqp->q_type & XFS_DQTYPE_BIGTIME) { + if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb)) + return __this_address; + if (dqp->q_id == 0) + return __this_address; + } + return NULL; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 282a65da93c7..f642884a6834 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -237,4 +237,7 @@ typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv); +time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); +time64_t xfs_dquot_set_grace_period(time64_t grace); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c31cd3be9fb2..3d1b95124744 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1008,6 +1008,21 @@ xfs_file_fadvise( return ret; } +/* Does this file, inode, or mount want synchronous writes? 
*/ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + STATIC loff_t xfs_file_remap_range( struct file *file_in, @@ -1065,7 +1080,7 @@ xfs_file_remap_range( if (ret) goto out_unlock; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: xfs_iunlock2_io_mmap(src, dest); @@ -1223,6 +1238,14 @@ __xfs_filemap_fault( return ret; } +static inline bool +xfs_is_write_fault( + struct vm_fault *vmf) +{ + return (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); +} + static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) @@ -1230,7 +1253,7 @@ xfs_filemap_fault( /* DAX can shortcut the normal fault path on write faults! */ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, IS_DAX(file_inode(vmf->vma->vm_file)) && - (vmf->flags & FAULT_FLAG_WRITE)); + xfs_is_write_fault(vmf)); } static vm_fault_t @@ -1243,7 +1266,7 @@ xfs_filemap_huge_fault( /* DAX can shortcut the normal fault path on write faults! 
*/ return __xfs_filemap_fault(vmf, pe_size, - (vmf->flags & FAULT_FLAG_WRITE)); + xfs_is_write_fault(vmf)); } static vm_fault_t diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 101028ebb571..deb99300d171 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -52,7 +52,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); /* initialise the xfs inode */ @@ -123,7 +122,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - ASSERT(!xfs_isiflocked(ip)); + ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); /* * Because we use RCU freeing we need to ensure the inode always @@ -1035,23 +1034,21 @@ xfs_reclaim_inode( if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out; - if (!xfs_iflock_nowait(ip)) + if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) - goto out_ifunlock; + goto out_clear_flush; if (!xfs_inode_clean(ip)) - goto out_ifunlock; + goto out_clear_flush; - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: - ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1101,8 +1098,8 @@ reclaim: __xfs_inode_free(ip); return; -out_ifunlock: - xfs_ifunlock(ip); +out_clear_flush: + xfs_iflags_clear(ip, XFS_IFLUSHING); out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: @@ -1211,7 +1208,7 @@ xfs_reclaim_inodes( while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); xfs_reclaim_inodes_ag(mp, &nr_to_scan); - }; + } } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c06129cffba9..49624973eecc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -598,22 +598,6 @@ xfs_lock_two_inodes( } } -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = 
bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wq_entry); -} - STATIC uint _xfs_dic2xflags( uint16_t di_flags, @@ -840,7 +824,7 @@ xfs_ialloc( if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } @@ -2531,11 +2515,8 @@ retry: * valid, the wrong inode or stale. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return; - } + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) + goto out_iflags_unlock; /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2552,16 +2533,14 @@ retry: } } ip->i_flags |= XFS_ISTALE; - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); /* - * If we can't get the flush lock, the inode is already attached. All + * If the inode is flushing, it is already attached to the buffer. All * we needed to do here is mark the inode stale so buffer IO completion * will remove it from the AIL. */ iip = ip->i_itemp; - if (!xfs_iflock_nowait(ip)) { + if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); ASSERT(iip->ili_last_fields); goto out_iunlock; @@ -2573,10 +2552,12 @@ retry: * commit as the flock synchronises removal of the inode from the * cluster buffer against inode reclaim. */ - if (!iip || list_empty(&iip->ili_item.li_bio_list)) { - xfs_ifunlock(ip); + if (!iip || list_empty(&iip->ili_item.li_bio_list)) goto out_iunlock; - } + + __xfs_iflags_set(ip, XFS_IFLUSHING); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); /* we have a dirty inode in memory that has not yet been flushed. 
*/ spin_lock(&iip->ili_lock); @@ -2586,9 +2567,16 @@ retry: spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return; + out_iunlock: if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_iflags_unlock: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); } /* @@ -2631,8 +2619,9 @@ xfs_ifree_cluster( /* * We obtain and lock the backing buffer first in the process - * here, as we have to ensure that any dirty inode that we - * can't get the flush lock on is attached to the buffer. + * here to ensure dirty inodes attached to the buffer remain in + * the flushing state while we mark them stale. + * * If we scan the in-memory inodes first, then buffer IO can * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. @@ -2717,7 +2706,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; @@ -3443,7 +3432,7 @@ xfs_iflush( int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip->ili_item.li_buf == bp); @@ -3553,8 +3542,8 @@ xfs_iflush( * * What we do is move the bits to the ili_last_fields field. When * logging the inode, these bits are moved back to the ili_fields field. - * In the xfs_iflush_done() routine we clear ili_last_fields, since we - * know that the information those bits represent is permanently on + * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since + * we know that the information those bits represent is permanently on * disk. 
As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. */ @@ -3568,7 +3557,7 @@ flush_out: /* * Store the current LSN of the inode so that we can tell whether the - * item has moved in the AIL from xfs_iflush_done(). + * item has moved in the AIL from xfs_buf_inode_iodone(). */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3613,7 +3602,7 @@ xfs_iflush_cluster( /* * Quick and dirty check to avoid locks if possible. */ - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) continue; if (xfs_ipincount(ip)) continue; @@ -3627,7 +3616,7 @@ xfs_iflush_cluster( */ spin_lock(&ip->i_flags_lock); ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { spin_unlock(&ip->i_flags_lock); continue; } @@ -3635,24 +3624,17 @@ xfs_iflush_cluster( /* * ILOCK will pin the inode against reclaim and prevent * concurrent transactions modifying the inode while we are - * flushing the inode. + * flushing the inode. If we get the lock, set the flushing + * state before we drop the i_flags_lock. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { spin_unlock(&ip->i_flags_lock); continue; } + __xfs_iflags_set(ip, XFS_IFLUSHING); spin_unlock(&ip->i_flags_lock); /* - * Skip inodes that are already flush locked as they have - * already been written to the buffer. - */ - if (!xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - continue; - } - - /* * Abort flushing this inode if we are shut down because the * inode may not currently be in the AIL. 
This can occur when * log I/O failure unpins the inode without inserting into the @@ -3661,7 +3643,6 @@ xfs_iflush_cluster( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); error = -EIO; @@ -3670,7 +3651,7 @@ xfs_iflush_cluster( /* don't block waiting on a log force to unpin dirty inodes */ if (xfs_ipincount(ip)) { - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); continue; } @@ -3678,7 +3659,7 @@ xfs_iflush_cluster( if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, bp); else - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e9a8bb184d1f..751a3d1d7d84 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -194,6 +194,11 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) return ip->i_cowfp && ip->i_cowfp->if_bytes; } +static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Return the buftarg used for data allocations on a given inode. 
*/ @@ -211,8 +216,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ @@ -234,36 +238,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) /* - * Synchronize processes attempting to flush the in-core inode back to disk. - */ - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - ASSERT(xfs_isiflocked(ip)); - xfs_iflags_clear(ip, XFS_IFLOCK); - smp_mb(); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - -/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6c65938cee1c..17e20a6d8b4e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -295,6 +295,28 @@ xfs_inode_item_format_attr_fork( } } +/* + * Convert an incore timestamp to a log timestamp. Note that the log format + * specifies host endian format! 
+ */ +static inline xfs_ictimestamp_t +xfs_inode_to_log_dinode_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_ictimestamp *lits; + xfs_ictimestamp_t its; + + if (xfs_inode_has_bigtime(ip)) + return xfs_inode_encode_bigtime(tv); + + lits = (struct xfs_legacy_ictimestamp *)&its; + lits->t_sec = tv.tv_sec; + lits->t_nsec = tv.tv_nsec; + + return its; +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -313,12 +335,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime.t_sec = inode->i_atime.tv_sec; - to->di_atime.t_nsec = inode->i_atime.tv_nsec; - to->di_mtime.t_sec = inode->i_mtime.tv_sec; - to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; - to->di_ctime.t_sec = inode->i_ctime.tv_sec; - to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; @@ -340,8 +359,7 @@ xfs_inode_to_log_dinode( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); - to->di_crtime.t_sec = from->di_crtime.tv_sec; - to->di_crtime.t_nsec = from->di_crtime.tv_nsec; + to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime); to->di_flags2 = from->di_flags2; to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; @@ -491,8 +509,7 @@ xfs_inode_item_push( (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - /* If the inode is already flush locked, we're already flushing. 
*/ - if (xfs_isiflocked(ip)) + if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; if (!xfs_buf_trylock(bp)) @@ -703,7 +720,7 @@ xfs_iflush_finish( iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; spin_unlock(&iip->ili_lock); - xfs_ifunlock(iip->ili_inode); + xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) xfs_buf_rele(bp); } @@ -711,11 +728,11 @@ xfs_iflush_finish( /* * Inode buffer IO completion routine. It is responsible for removing inodes - * attached to the buffer from the AIL if they have not been re-logged, as well - * as completing the flush and unlocking the inode. + * attached to the buffer from the AIL if they have not been re-logged and + * completing the inode flush. */ void -xfs_iflush_done( +xfs_buf_inode_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -754,11 +771,21 @@ xfs_iflush_done( list_splice_tail(&flushed_inodes, &bp->b_li_list); } +void +xfs_buf_inode_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + set_bit(XFS_LI_FAILED, &lip->li_flags); +} + /* - * This is the inode flushing abort routine. It is called from xfs_iflush when + * This is the inode flushing abort routine. It is called when * the filesystem is shutting down to clean up the inode state. It is * responsible for removing the inode item from the AIL if it has not been - * re-logged, and unlocking the inode's flush lock. + * re-logged and clearing the inode's flush state. 
*/ void xfs_iflush_abort( @@ -790,7 +817,7 @@ xfs_iflush_abort( list_del_init(&iip->ili_item.li_bio_list); spin_unlock(&iip->ili_lock); } - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); if (bp) xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 048b5e7dee90..4b926e32831c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,8 +25,8 @@ struct xfs_inode_log_item { * * We need atomic changes between inode dirtying, inode flushing and * inode completion, but these all hold different combinations of - * ILOCK and iflock and hence we need some other method of serialising - * updates to the flush state. + * ILOCK and IFLUSHING and hence we need some other method of + * serialising updates to the flush state. */ spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ @@ -43,7 +43,6 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5e0d291835b3..cb44f7653f03 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -115,6 +115,82 @@ out_free_ip: return error; } +static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); +} + +/* Convert a log timestamp to an ondisk timestamp. 
*/ +static inline xfs_timestamp_t +xfs_log_dinode_to_disk_ts( + struct xfs_log_dinode *from, + const xfs_ictimestamp_t its) +{ + struct xfs_legacy_timestamp *lts; + struct xfs_legacy_ictimestamp *lits; + xfs_timestamp_t ts; + + if (xfs_log_dinode_has_bigtime(from)) + return cpu_to_be64(its); + + lts = (struct xfs_legacy_timestamp *)&ts; + lits = (struct xfs_legacy_ictimestamp *)&its; + lts->t_sec = cpu_to_be32(lits->t_sec); + lts->t_nsec = cpu_to_be32(lits->t_nsec); + + return ts; +} + +STATIC void +xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_onlink = 0; + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + + to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); + to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); + to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); + + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime = xfs_log_dinode_to_disk_ts(from, + from->di_crtime); + to->di_flags2 = cpu_to_be64(from->di_flags2); + 
to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f22a66777cd..bca7659fb5c6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -404,7 +404,7 @@ xfs_ioc_attr_list( context.cursor.offset)) return -EINVAL; - buffer = kmem_zalloc_large(bufsize, 0); + buffer = kvzalloc(bufsize, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -1190,7 +1190,8 @@ xfs_flags2diflags2( unsigned int xflags) { uint64_t di_flags2 = - (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; @@ -1690,7 +1691,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count > ULONG_MAX / recsize) return -ENOMEM; - buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0); + buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e2ec91b2d0f4..a17d788921d6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -265,32 +265,6 @@ xlog_header_check_mount( return 0; } -void -xlog_recover_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - /* - * We're not going to bother about retrying - * this during recovery. One strike! - */ - if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __this_address); - xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); - } - } - - /* - * On v5 supers, a bli could be attached to update the metadata LSN. - * Clean it up. 
- */ - if (bp->b_log_item) - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - bp->b_flags &= ~_XBF_LOGRECOVERY; - xfs_buf_ioend_finish(bp); -} - /* * This routine finds (to an approximation) the first block in the physical * log which contains the given cycle. It uses a binary search algorithm. @@ -2097,7 +2071,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, 0); + ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -3294,14 +3268,14 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - struct xlog *log, - xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) { - struct xfs_mount *mp = log->l_mp; - int error; - xfs_buf_t *bp; - xfs_sb_t *sbp; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp = mp->m_sb_bp; + struct xfs_sb *sbp = &mp->m_sb; + int error; trace_xfs_log_recover(log, head_blk, tail_blk); @@ -3315,9 +3289,8 @@ xlog_do_recover( /* * If IO errors happened during recovery, bail out. */ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - } /* * We now update the tail_lsn since much of the recovery has completed @@ -3331,16 +3304,12 @@ xlog_do_recover( xlog_assign_tail_lsn(mp); /* - * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock and reverify it. + * Now that we've finished replaying all buffer and inode updates, + * re-read the superblock and reverify it. 
*/ - bp = xfs_getsb(mp); - bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); - ASSERT(!(bp->b_flags & XBF_WRITE)); - bp->b_flags |= XBF_READ; - bp->b_ops = &xfs_sb_buf_ops; - - error = xfs_buf_submit(bp); + xfs_buf_lock(bp); + xfs_buf_hold(bp); + error = _xfs_buf_read(bp, XBF_READ); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_ioerror_alert(bp, __this_address); @@ -3351,7 +3320,6 @@ xlog_do_recover( } /* Convert superblock from on-disk format */ - sbp = &mp->m_sb; xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c8ae49a1e99c..150ee5cb8645 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -80,9 +80,9 @@ xfs_uuid_mount( } if (hole < 0) { - xfs_uuid_table = kmem_realloc(xfs_uuid_table, + xfs_uuid_table = krealloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - 0); + GFP_KERNEL | __GFP_NOFAIL); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -1059,11 +1059,12 @@ xfs_unmountfs( * We can potentially deadlock here if we have an inode cluster * that has been freed has its buffer still pinned in memory because * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will have their flush locks held until the - * transaction hits the disk and the callbacks run. the inode - * flush takes the flush lock unconditionally and with nothing to - * push out the iclog we will never get that unlocked. hence we - * need to force the log first. + * on that buffer will be pinned to the buffer until the + * transaction hits the disk and the callbacks run. Pushing the AIL will + * skip the stale inodes and may never see the pinned buffer, so + * nothing will push out the iclog and unpin the buffer. Hence we + * need to force the log here to ensure all items are flushed into the + * AIL before we go any further. 
*/ xfs_log_force(mp, XFS_LOG_SYNC); @@ -1289,23 +1290,6 @@ xfs_mod_frextents( } /* - * xfs_getsb() is called to obtain the buffer for the superblock. - * The buffer is returned locked and read in from disk. - * The buffer should be released with a call to xfs_brelse(). - */ -struct xfs_buf * -xfs_getsb( - struct xfs_mount *mp) -{ - struct xfs_buf *bp = mp->m_sb_bp; - - xfs_buf_lock(bp); - xfs_buf_hold(bp); - ASSERT(bp->b_flags & XBF_DONE); - return bp; -} - -/* * Used to free the superblock along various error paths. */ void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a72cfcaa4ad1..dfa429b77ee2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -410,7 +410,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern struct xfs_buf *xfs_getsb(xfs_mount_t *); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 5f04d8a5ab2a..0aa87c210104 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -15,6 +15,10 @@ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) +#define XFS_CHECK_VALUE(value, expected) \ + BUILD_BUG_ON_MSG((value) != (expected), \ + "XFS: value of " #value " is wrong, expected " #expected) + static inline void __init xfs_check_ondisk_structs(void) { @@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); @@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct 
xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); @@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); @@ -121,7 +126,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); 
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); @@ -152,6 +158,20 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); + + /* + * Make sure the incore inode timestamp range corresponds to hand + * converted values based on the ondisk format specification. + */ + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET, + XFS_LEGACY_TIME_MIN); + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET, + 16299260424LL); + + /* Do the same with the incore quota expiration range. */ + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, + 16299260424LL); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be67570badf8..3f82e0c92c2d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -661,6 +661,17 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + qinf->qi_expiry_min = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN); + qinf->qi_expiry_max = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX); + } else { + qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN; + qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX; + } + trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min, + qinf->qi_expiry_max); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -879,6 +890,8 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + ddq->d_type |= XFS_DQTYPE_BIGTIME; } if (xfs_sb_version_hascrc(&mp->m_sb)) { diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9c078c35d924..e3dabab44097 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -65,6 +65,10 @@ 
struct xfs_quotainfo { struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; struct shrinker qi_shrinker; + + /* Minimum and maximum quota expiration timestamp values. */ + time64_t qi_expiry_min; + time64_t qi_expiry_max; }; static inline struct radix_tree_root * diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1c542b4a5220..ca1b57d291dc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -479,13 +479,19 @@ xfs_setqlim_warns( static inline void xfs_setqlim_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim, s64 timer) { - res->timer = timer; - if (qlim) - qlim->time = timer; + if (qlim) { + /* Set the length of the default grace period. */ + res->timer = xfs_dquot_set_grace_period(timer); + qlim->time = res->timer; + } else { + /* Set the grace period expiration on a quota. */ + res->timer = xfs_dquot_set_timeout(mp, timer); + } } /* @@ -574,7 +580,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); /* Blocks on the realtime device. */ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? @@ -590,7 +596,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_RT_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); /* Inodes */ hard = (newlim->d_fieldmask & QC_INO_HARD) ? 
@@ -606,7 +612,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_INO_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); if (id != 0) { /* diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 06b22e35fc90..5a62398940d0 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -108,8 +108,6 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); -void xfs_dquot_done(struct xfs_buf *); - #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -151,12 +149,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) - -static inline void xfs_dquot_done(struct xfs_buf *bp) -{ - return; -} - #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6209e7b6b895..5b89c12f1566 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -247,6 +247,9 @@ xfs_rtallocate_extent_block( end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; i <= end; i++) { + /* Make sure we don't scan off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. @@ -442,6 +445,14 @@ xfs_rtallocate_extent_near( */ if (bno >= mp->m_sb.sb_rextents) bno = mp->m_sb.sb_rextents - 1; + + /* Make sure we don't run off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + /* * Try the exact allocation first. 
*/ @@ -862,7 +873,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); + mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 71ac6c1cdc36..baf5de30eebb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -654,11 +654,11 @@ xfs_fs_destroy_inode( ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); /* - * We always use background reclaim here because even if the - * inode is clean, it still may be under IO and hence we have - * to take the flush lock. The background reclaim path handles - * this more efficiently than we can here, so simply let background - * reclaim tear down all inodes. + * We always use background reclaim here because even if the inode is + * clean, it still may be under IO and hence we have wait for IO + * completion to occur before we can reclaim the inode. The background + * reclaim path handles this more efficiently than we can here, so + * simply let background reclaim tear down all inodes. 
*/ xfs_inode_set_reclaim_tag(ip); } @@ -1484,8 +1484,14 @@ xfs_fc_fill_super( sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; - sb->s_time_min = S32_MIN; - sb->s_time_max = S32_MAX; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); + sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); + } else { + sb->s_time_min = XFS_LEGACY_TIME_MIN; + sb->s_time_max = XFS_LEGACY_TIME_MAX; + } + trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); sb->s_iflags |= SB_I_CGROUPWB; set_posix_acl_flag(sb); @@ -1494,6 +1500,10 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL big timestamp feature in use. Use at your own risk!"); + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; @@ -1549,6 +1559,10 @@ xfs_fc_fill_super( goto out_filestream_unmount; } + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL inode btree counters feature in use. 
Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index abb1d859f226..dcdcf99cfa5d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -338,7 +338,7 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); @@ -3676,7 +3676,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ DEFINE_KMEM_EVENT(kmem_alloc); DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); -DEFINE_KMEM_EVENT(kmem_realloc); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3844,6 +3843,32 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->nr_records) ) +DECLARE_EVENT_CLASS(xfs_timestamp_range_class, + TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), + TP_ARGS(mp, min, max), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(long long, min) + __field(long long, max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->min = min; + __entry->max = max; + ), + TP_printk("dev %d:%d min %lld max %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->min, + __entry->max) +) + +#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \ +DEFINE_EVENT(xfs_timestamp_range_class, name, \ + TP_PROTO(struct xfs_mount *mp, long long min, long long max), \ + TP_ARGS(mp, min, max)) +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ed72867b1a19..ca18a040336a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -468,7 +468,7 @@ 
xfs_trans_apply_sb_deltas( xfs_buf_t *bp; int whole = 0; - bp = xfs_trans_getsb(tp, tp->t_mountp); + bp = xfs_trans_getsb(tp); sbp = bp->b_addr; /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b752501818d2..f46534b75236 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -209,7 +209,7 @@ xfs_trans_read_buf( flags, bpp, ops); } -struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *); +struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 11cd666cd99a..42d63b830cb9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -166,50 +166,34 @@ xfs_trans_get_buf_map( } /* - * Get and lock the superblock buffer of this file system for the - * given transaction. - * - * We don't need to use incore_match() here, because the superblock - * buffer is a private buffer which we keep a pointer to in the - * mount structure. + * Get and lock the superblock buffer for the given transaction. */ -xfs_buf_t * +struct xfs_buf * xfs_trans_getsb( - xfs_trans_t *tp, - struct xfs_mount *mp) + struct xfs_trans *tp) { - xfs_buf_t *bp; - struct xfs_buf_log_item *bip; + struct xfs_buf *bp = tp->t_mountp->m_sb_bp; /* - * Default to just trying to lock the superblock buffer - * if tp is NULL. + * Just increment the lock recursion count if the buffer is already + * attached to this transaction. */ - if (tp == NULL) - return xfs_getsb(mp); - - /* - * If the superblock buffer already has this transaction - * pointer in its b_fsprivate2 field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the buffer to the caller. 
- */ - bp = mp->m_sb_bp; if (bp->b_transp == tp) { - bip = bp->b_log_item; + struct xfs_buf_log_item *bip = bp->b_log_item; + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); - return bp; - } + } else { + xfs_buf_lock(bp); + xfs_buf_hold(bp); + _xfs_trans_bjoin(tp, bp, 1); - bp = xfs_getsb(mp); - if (bp == NULL) - return NULL; + trace_xfs_trans_getsb(bp->b_log_item); + } - _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_getsb(bp->b_log_item); return bp; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c6ba7ef18e06..133fc6fc3edd 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -55,6 +55,12 @@ xfs_trans_log_dquot( { ASSERT(XFS_DQ_IS_LOCKED(dqp)); + /* Upgrade the dquot to bigtime format if possible. */ + if (dqp->q_id != 0 && + xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) && + !(dqp->q_type & XFS_DQTYPE_BIGTIME)) + dqp->q_type |= XFS_DQTYPE_BIGTIME; + tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } |