diff options
Diffstat (limited to 'fs')
150 files changed, 1776 insertions, 1093 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 92cd1d80218d..3576123d8299 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -213,7 +213,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); - /* fall through */ + fallthrough; case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 30d526fecc3f..05e963402e25 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -18,11 +18,11 @@ static inline unsigned int adfs_readval(unsigned char *p, int len) switch (len) { case 4: val |= p[3] << 24; - /* fall through */ + fallthrough; case 3: val |= p[2] << 16; - /* fall through */ + fallthrough; case 2: val |= p[1] << 8; - /* fall through */ + fallthrough; default: val |= p[0]; } return val; @@ -32,11 +32,11 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) { switch (len) { case 4: p[3] = val >> 24; - /* fall through */ + fallthrough; case 3: p[2] = val >> 16; - /* fall through */ + fallthrough; case 2: p[1] = val >> 8; - /* fall through */ + fallthrough; default: p[0] = val; } } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index f708c45d5f66..29f11e10a7c7 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -420,24 +420,51 @@ affs_mode_to_prot(struct inode *inode) u32 prot = AFFS_I(inode)->i_protect; umode_t mode = inode->i_mode; + /* + * First, clear all RWED bits for owner, group, other. + * Then, recalculate them afresh. + * + * We'll always clear the delete-inhibit bit for the owner, as that is + * the classic single-user mode AmigaOS protection bit and we need to + * stay compatible with all scenarios. + * + * Since multi-user AmigaOS is an extension, we'll only set the + * delete-allow bit if any of the other bits in the same user class + * (group/other) are used. + */ + prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD + | FIBF_NOWRITE | FIBF_NODELETE + | FIBF_GRP_EXECUTE | FIBF_GRP_READ + | FIBF_GRP_WRITE | FIBF_GRP_DELETE + | FIBF_OTR_EXECUTE | FIBF_OTR_READ + | FIBF_OTR_WRITE | FIBF_OTR_DELETE); + + /* Classic single-user AmigaOS flags. These are inverted. */ if (!(mode & 0100)) prot |= FIBF_NOEXECUTE; if (!(mode & 0400)) prot |= FIBF_NOREAD; if (!(mode & 0200)) prot |= FIBF_NOWRITE; + + /* Multi-user extended flags. Not inverted. */ if (mode & 0010) prot |= FIBF_GRP_EXECUTE; if (mode & 0040) prot |= FIBF_GRP_READ; if (mode & 0020) prot |= FIBF_GRP_WRITE; + if (mode & 0070) + prot |= FIBF_GRP_DELETE; + if (mode & 0001) prot |= FIBF_OTR_EXECUTE; if (mode & 0004) prot |= FIBF_OTR_READ; if (mode & 0002) prot |= FIBF_OTR_WRITE; + if (mode & 0007) + prot |= FIBF_OTR_DELETE; AFFS_I(inode)->i_protect = prot; } diff --git a/fs/affs/file.c b/fs/affs/file.c index a26a0f96c119..d91b0133d95d 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -429,6 +429,24 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int affs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* Clear Archived bit on file writes, as AmigaOS would do */ + if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { + AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED; + mark_inode_dirty(inode); + } + + return ret; +} + static sector_t _affs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,affs_get_block); @@ -438,7 +456,7 @@ const struct address_space_operations affs_aops = { .readpage = affs_readpage, .writepage = affs_writepage, .write_begin = affs_write_begin, - .write_end = generic_write_end, + .write_end = affs_write_end, .direct_IO = affs_direct_IO, .bmap = _affs_bmap }; @@ -795,6 +813,12 @@ done: if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; + /* Clear Archived bit on file writes, as AmigaOS would do */ + if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { + AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED; + mark_inode_dirty(inode); + } + err_first_bh: unlock_page(page); put_page(page); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a346cf7659f1..044412110b52 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -93,7 +93,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) case ST_ROOT: inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; - /* fall through */ + fallthrough; case ST_USERDIR: if (be32_to_cpu(tail->stype) == ST_USERDIR || affs_test_opt(sbi->s_flags, SF_SETMODE)) { diff --git a/fs/affs/super.c b/fs/affs/super.c index 47107c6712a6..a100cd9950c8 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -474,7 +474,7 @@ got_root: case MUFS_INTLFFS: case MUFS_DCFFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall thru */ + fallthrough; case FS_INTLFFS: case FS_DCFFS: affs_set_opt(sbi->s_flags, SF_INTL); @@ -486,7 +486,7 @@ got_root: break; case MUFS_OFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall through */ + fallthrough; case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); sb->s_flags |= SB_NOEXEC; @@ -494,7 +494,7 @@ got_root: case MUFS_DCOFS: case MUFS_INTLOFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall through */ + fallthrough; case FS_DCOFS: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index bef413818af7..a4e9e6e07e93 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -252,7 +252,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the FID array and its count in two steps */ - /* fall through */ + fallthrough; case 1: _debug("extract FID count"); ret = afs_extract_data(call, true); @@ -271,7 +271,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) afs_extract_to_buf(call, call->count * 3 * 4); call->unmarshall++; - /* Fall through */ + fallthrough; case 2: _debug("extract FID array"); ret = afs_extract_data(call, true); @@ -297,7 +297,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the callback array and its count in two steps */ - /* fall through */ + fallthrough; case 3: _debug("extract CB count"); ret = afs_extract_data(call, true); @@ -312,7 +312,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); call->unmarshall++; - /* Fall through */ + fallthrough; case 4: _debug("extract discard %zu/%u", iov_iter_count(call->iter), call->count2 * 3 * 4); @@ -391,7 +391,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; case 1: _debug("extract UUID"); ret = afs_extract_data(call, false); @@ -503,7 +503,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; case 1: _debug("extract UUID"); ret = afs_extract_data(call, false); @@ -618,7 +618,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the FID array and its count in two steps */ - /* Fall through */ + fallthrough; case 1: _debug("extract FID count"); ret = afs_extract_data(call, true); @@ -637,7 +637,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; case 2: _debug("extract FID array"); ret = afs_extract_data(call, false); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index b79879aacc02..7b784af604fd 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -382,15 +382,17 @@ void afs_dynroot_depopulate(struct super_block *sb) net->dynroot_sb = NULL; mutex_unlock(&net->proc_cells_lock); - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); + if (root) { + inode_lock(root->d_inode); + + /* Remove all the pins for dirs created for manually added cells */ + list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + if (subdir->d_fsdata) { + subdir->d_fsdata = NULL; + dput(subdir); + } } - } - inode_unlock(root->d_inode); + inode_unlock(root->d_inode); + } } diff --git a/fs/afs/file.c b/fs/afs/file.c index 6f6ed1605cfe..371d1488cc54 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -311,7 +311,7 @@ int afs_page_filler(void *data, struct page *page) case -ENOBUFS: _debug("cache said ENOBUFS"); - /* fall through */ + fallthrough; default: go_on: req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index ffb8575345ca..cb3054c7843e 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -376,7 +376,6 @@ again: spin_unlock(&vnode->lock); return; - /* Fall through */ default: /* Looks like a lock request was withdrawn. */ spin_unlock(&vnode->lock); diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 24fd163c6323..97cab12b0a6c 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -235,6 +235,7 @@ int afs_put_operation(struct afs_operation *op) afs_end_cursor(&op->ac); afs_put_serverlist(op->net, op->server_list); afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); + key_put(op->key); kfree(op); return ret; } diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 5d9ef517cf81..e7e98ad63a91 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -161,8 +161,8 @@ responded: } } - rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); - if (rtt_us < server->probe.rtt) { + if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && + rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index acb4d0ca2649..1d95ed9dd86e 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -320,7 +320,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->tmp_u = htonl(0); afs_extract_to_tmp(call); } - /* Fall through */ + fallthrough; /* extract the returned data length */ case 1: @@ -348,7 +348,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->bvec[0].bv_page = req->pages[req->index]; iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); - /* Fall through */ + fallthrough; /* extract the returned data */ case 2: @@ -375,7 +375,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* Discard any excess data the server gave us */ afs_extract_discard(call, req->actual_len - req->len); call->unmarshall = 3; - /* Fall through */ + fallthrough; case 3: _debug("extract discard %zu/%llu", @@ -388,7 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) no_more_data: call->unmarshall = 4; afs_extract_to_buf(call, (21 + 3 + 6) * 4); - /* Fall through */ + fallthrough; /* extract the metadata */ case 4: @@ -1343,7 +1343,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) case 0: call->unmarshall++; afs_extract_to_buf(call, 12 * 4); - /* Fall through */ + fallthrough; /* extract the returned status record */ case 1: @@ -1356,7 +1356,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* extract the volume name length */ case 2: @@ -1371,7 +1371,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the volume name */ case 3: @@ -1385,7 +1385,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("volname '%s'", p); afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message length */ case 4: @@ -1400,7 +1400,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message */ case 5: @@ -1415,7 +1415,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day length */ case 6: @@ -1430,7 +1430,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day */ case 7: @@ -1682,7 +1682,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the capabilities word count */ case 1: @@ -1696,7 +1696,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) call->count2 = count; afs_extract_discard(call, count * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract capabilities words */ case 2: @@ -1776,7 +1776,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file status count and array in two steps */ case 1: @@ -1794,7 +1794,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_counts: afs_extract_to_buf(call, 21 * sizeof(__be32)); - /* Fall through */ + fallthrough; case 2: _debug("extract status array %u", call->count); @@ -1824,7 +1824,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->count = 0; call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* Extract the callback count and array in two steps */ case 3: @@ -1841,7 +1841,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_cbs: afs_extract_to_buf(call, 3 * sizeof(__be32)); - /* Fall through */ + fallthrough; case 4: _debug("extract CB array"); @@ -1870,7 +1870,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_buf(call, 6 * sizeof(__be32)); call->unmarshall++; - /* Fall through */ + fallthrough; case 5: ret = afs_extract_data(call, false); @@ -1974,7 +1974,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the returned data length */ case 1: @@ -1992,7 +1992,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) acl->size = call->count2; afs_extract_begin(call, acl->data, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the returned data */ case 2: @@ -2002,7 +2002,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) afs_extract_to_buf(call, (21 + 6) * 4); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the metadata */ case 3: diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 792ac711985e..18042b7dab6a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -401,22 +401,24 @@ struct afs_vlserver { #define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ #define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ #define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ +#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ rwlock_t lock; /* Lock on addresses */ atomic_t usage; + unsigned int rtt; /* Server's current RTT in uS */ /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT as ktime/64 */ + unsigned int rtt; /* RTT in uS */ u32 abort_code; short error; - bool have_result; - bool responded:1; - bool is_yfs:1; - bool not_yfs:1; - bool local_failure:1; + unsigned short flags; +#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ +#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ +#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ +#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; u16 port; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 5334f1bd2bca..1d1a8debe472 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -120,42 +120,42 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) if (e->error == -ETIMEDOUT || e->error == -ETIME) return; - /* Fall through */ + fallthrough; case -ETIMEDOUT: case -ETIME: if (e->error == -ENOMEM || e->error == -ENONET) return; - /* Fall through */ + fallthrough; case -ENOMEM: case -ENONET: if (e->error == -ERFKILL) return; - /* Fall through */ + fallthrough; case -ERFKILL: if (e->error == -EADDRNOTAVAIL) return; - /* Fall through */ + fallthrough; case -EADDRNOTAVAIL: if (e->error == -ENETUNREACH) return; - /* Fall through */ + fallthrough; case -ENETUNREACH: if (e->error == -EHOSTUNREACH) return; - /* Fall through */ + fallthrough; case -EHOSTUNREACH: if (e->error == -EHOSTDOWN) return; - /* Fall through */ + fallthrough; case -EHOSTDOWN: if (e->error == -ECONNREFUSED) return; - /* Fall through */ + fallthrough; case -ECONNREFUSED: if (e->error == -ECONNRESET) return; - /* Fall through */ + fallthrough; case -ECONNRESET: /* Responded, but call expired. */ if (e->responded) return; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index e817fc740ba0..e8babb62ed44 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -310,6 +310,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) alist->preferred == i ? '>' : '-', &alist->addrs[i].transport); } + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); + seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", + vlserver->probe.flags, vlserver->probe.error, + vlserver->probe.abort_code, + atomic_read(&vlserver->probe_outstanding)); return 0; } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 6a0935cb822f..d83f13c44b92 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_operation *op) case -ETIME: if (op->error != -EDESTADDRREQ) goto iterate_address; - /* Fall through */ + fallthrough; case -ERFKILL: case -EADDRNOTAVAIL: case -ENETUNREACH: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8fc8fb406a5a..8be709cb8542 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -568,7 +568,7 @@ static void afs_deliver_to_call(struct afs_call *call) case -EIO: pr_err("kAFS: Call %u in bad state %u\n", call->debug_id, state); - /* Fall through */ + fallthrough; case -ENODATA: case -EBADMSG: case -EMSGSIZE: @@ -669,7 +669,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call, ret = call->ret0; call->ret0 = 0; - /* Fall through */ + fallthrough; case -ECONNABORTED: ac->responded = true; break; @@ -872,7 +872,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); - /* Fall through */ + fallthrough; default: _leave(" [error]"); return; diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 8fea54eba0c2..38b2ba1d9ec0 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -21,6 +21,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, rwlock_init(&vlserver->lock); init_waitqueue_head(&vlserver->probe_wq); spin_lock_init(&vlserver->probe_lock); + vlserver->rtt = UINT_MAX; vlserver->name_len = name_len; vlserver->port = port; memcpy(vlserver->name, name, name_len); diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index e3aa013c2177..d1c7068b4346 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -11,15 +11,33 @@ #include "internal.h" #include "protocol_yfs.h" -static bool afs_vl_probe_done(struct afs_vlserver *server) + +/* + * Handle the completion of a set of probes. + */ +static void afs_finished_vl_probe(struct afs_vlserver *server) { - if (!atomic_dec_and_test(&server->probe_outstanding)) - return false; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { + server->rtt = UINT_MAX; + clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + } - wake_up_var(&server->probe_outstanding); clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING); - return true; +} + +/* + * Handle the completion of a probe RPC call. + */ +static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) +{ + if (atomic_dec_and_test(&server->probe_outstanding)) { + afs_finished_vl_probe(server); + wake_up = true; + } + + if (wake_up) + wake_up_all(&server->probe_wq); } /* @@ -45,15 +63,20 @@ void afs_vlserver_probe_result(struct afs_call *call) server->probe.error = 0; goto responded; case -ECONNABORTED: - if (!server->probe.responded) { + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { server->probe.abort_code = call->abort_code; server->probe.error = ret; } goto responded; case -ENOMEM: case -ENONET: - server->probe.local_failure = true; - afs_io_error(call, afs_io_error_vl_probe_fail); + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EKEYREJECTED: + server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE; + if (server->probe.error == 0) + server->probe.error = ret; + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ case -ERFKILL: @@ -67,12 +90,12 @@ void afs_vlserver_probe_result(struct afs_call *call) default: clear_bit(index, &alist->responded); set_bit(index, &alist->failed); - if (!server->probe.responded && + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && (server->probe.error == 0 || server->probe.error == -ETIMEDOUT || server->probe.error == -ETIME)) server->probe.error = ret; - afs_io_error(call, afs_io_error_vl_probe_fail); + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); goto out; } @@ -81,39 +104,36 @@ responded: clear_bit(index, &alist->failed); if (call->service_id == YFS_VL_SERVICE) { - server->probe.is_yfs = true; + server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } else { - server->probe.not_yfs = true; - if (!server->probe.is_yfs) { + server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } } - rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); - if (rtt_us < server->probe.rtt) { + if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && + rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; + server->rtt = rtt_us; alist->preferred = index; - have_result = true; } smp_wmb(); /* Set rtt before responded. */ - server->probe.responded = true; + server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED; set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + have_result = true; out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", server_index, index, &alist->addrs[index].transport, rtt_us, ret); - have_result |= afs_vl_probe_done(server); - if (have_result) { - server->probe.have_result = true; - wake_up_var(&server->probe.have_result); - wake_up_all(&server->probe_wq); - } + afs_done_one_vl_probe(server, have_result); } /* @@ -151,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net, in_progress = true; } else { afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); + afs_done_one_vl_probe(server, false); } } - if (!in_progress) - afs_vl_probe_done(server); return in_progress; } @@ -193,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, { struct wait_queue_entry *waits; struct afs_vlserver *server; - unsigned int rtt = UINT_MAX; + unsigned int rtt = UINT_MAX, rtt_s; bool have_responders = false; int pref = -1, i; @@ -205,7 +224,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, server = vllist->servers[i].server; if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) __clear_bit(i, &untried); - if (server->probe.responded) + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) have_responders = true; } } @@ -231,7 +250,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if (server->probe.responded) + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) goto stop; if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) still_probing = true; @@ -249,10 +268,11 @@ stop: for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if (server->probe.responded && - server->probe.rtt < rtt) { + rtt_s = READ_ONCE(server->rtt); + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) && + rtt_s < rtt) { pref = i; - rtt = server->probe.rtt; + rtt = rtt_s; } remove_wait_queue(&server->probe_wq, &waits[i]); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index f405ca8b240a..c0458c903b31 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -192,7 +192,8 @@ pick_server: for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - if (!test_bit(i, &vc->untried) || !s->probe.responded) + if (!test_bit(i, &vc->untried) || + !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) continue; if (s->probe.rtt < rtt) { vc->index = i; @@ -262,10 +263,14 @@ no_more_servers: for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) + e.responded = true; afs_prioritise_error(&e, READ_ONCE(s->probe.error), s->probe.abort_code); } + error = e.error; + failed_set_error: vc->error = error; failed: diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index fd82850cd424..dc9327332f06 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -196,7 +196,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) /* Extract the returned uuid, uniquifier, nentries and * blkaddrs size */ - /* Fall through */ + fallthrough; case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -221,7 +221,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) count = min(call->count, 4U); afs_extract_to_buf(call, count * sizeof(__be32)); - /* Fall through - and extract entries */ + fallthrough; /* and extract entries */ case 2: ret = afs_extract_data(call, call->count > 4); if (ret < 0) @@ -324,7 +324,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through - and extract the capabilities word count */ + fallthrough; /* and extract the capabilities word count */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -337,7 +337,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) call->unmarshall++; afs_extract_discard(call, count * sizeof(__be32)); - /* Fall through - and extract capabilities words */ + fallthrough; /* and extract capabilities words */ case 2: ret = afs_extract_data(call, false); if (ret < 0) @@ -436,7 +436,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) /* Extract the returned uuid, uniquifier, fsEndpoints count and * either the first fsEndpoint type or the volEndpoints * count if there are no fsEndpoints. */ - /* Fall through */ + fallthrough; case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -475,7 +475,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall = 2; - /* Fall through - and extract fsEndpoints[] entries */ + fallthrough; /* and extract fsEndpoints[] entries */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -526,7 +526,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) * extract the type of the next endpoint when we extract the * data of the current one, but this is the first... */ - /* Fall through */ + fallthrough; case 3: ret = afs_extract_data(call, true); if (ret < 0) @@ -552,7 +552,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall = 4; - /* Fall through - and extract volEndpoints[] entries */ + fallthrough; /* and extract volEndpoints[] entries */ case 4: ret = afs_extract_data(call, true); if (ret < 0) @@ -587,7 +587,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_discard(call, 0); call->unmarshall = 5; - /* Fall through - Done */ + fallthrough; /* Done */ case 5: ret = afs_extract_data(call, false); if (ret < 0) @@ -663,7 +663,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through - and extract the cell name length */ + fallthrough; /* and extract the cell name length */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -685,7 +685,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) afs_extract_begin(call, cell_name, namesz); call->unmarshall++; - /* Fall through - and extract cell name */ + fallthrough; /* and extract cell name */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -694,7 +694,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) afs_extract_discard(call, call->count2); call->unmarshall++; - /* Fall through - and extract padding */ + fallthrough; /* and extract padding */ case 3: ret = afs_extract_data(call, false); if (ret < 0) diff --git a/fs/afs/write.c b/fs/afs/write.c index a121c247d95a..4b2265cb1891 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -609,7 +609,7 @@ no_more: default: pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - /* Fall through */ + fallthrough; case -EACCES: case -EPERM: case -ENOKEY: diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 8c24fdc899e3..3b1239b7e90d 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -373,7 +373,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) req->offset = req->pos & (PAGE_SIZE - 1); afs_extract_to_tmp64(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the returned data length */ case 1: @@ -401,7 +401,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) call->bvec[0].bv_page = req->pages[req->index]; iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); - /* Fall through */ + fallthrough; /* extract the returned data */ case 2: @@ -428,7 +428,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) /* Discard any excess data the server gave us */ afs_extract_discard(call, req->actual_len - req->len); call->unmarshall = 3; - /* Fall through */ + fallthrough; case 3: _debug("extract discard %zu/%llu", @@ -444,7 +444,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - /* Fall through */ + fallthrough; /* extract the metadata */ case 4: @@ -461,7 +461,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; - /* Fall through */ + fallthrough; case 5: break; @@ -1262,7 +1262,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) case 0: call->unmarshall++; afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus)); - /* Fall through */ + fallthrough; /* extract the returned status record */ case 1: @@ -1275,7 +1275,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* extract the volume name length */ case 2: @@ -1290,7 +1290,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the volume name */ case 3: @@ -1304,7 +1304,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) _debug("volname '%s'", p); afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message length */ case 4: @@ -1319,7 +1319,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the offline message */ case 5: @@ -1334,7 +1334,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day length */ case 6: @@ -1349,7 +1349,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the message of the day */ case 7: @@ -1363,7 +1363,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; - /* Fall through */ + fallthrough; case 8: break; @@ -1622,7 +1622,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file status count and array in two steps */ case 1: @@ -1640,7 +1640,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_counts: afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus)); - /* Fall through */ + fallthrough; case 2: _debug("extract status array %u", call->count); @@ -1670,7 +1670,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->count = 0; call->unmarshall++; afs_extract_to_tmp(call); - /* Fall through */ + fallthrough; /* Extract the callback count and array in two steps */ case 3: @@ -1687,7 +1687,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; more_cbs: afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack)); - /* Fall through */ + fallthrough; case 4: _debug("extract CB array"); @@ -1716,7 +1716,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); call->unmarshall++; - /* Fall through */ + fallthrough; case 5: ret = afs_extract_data(call, false); @@ -1727,7 +1727,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_YFSVolSync(&bp, &op->volsync); call->unmarshall++; - /* Fall through */ + fallthrough; case 6: break; @@ -1804,7 +1804,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) case 0: afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file ACL length */ case 1: @@ -1826,7 +1826,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) afs_extract_discard(call, size); } call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the file ACL */ case 2: @@ -1836,7 +1836,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the volume ACL length */ case 3: @@ -1858,7 +1858,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) afs_extract_discard(call, size); } call->unmarshall++; - /* Fall through */ + fallthrough; /* Extract the volume ACL */ case 4: @@ -1871,7 +1871,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); call->unmarshall++; - /* Fall through */ + fallthrough; /* extract the metadata */ case 5: @@ -1886,7 +1886,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) xdr_decode_YFSVolSync(&bp, &op->volsync); call->unmarshall++; - /* Fall through */ + fallthrough; case 6: break; @@ -1511,7 +1511,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) * may be already running. Just fail this IO with EINTR. */ ret = -EINTR; - /*FALLTHRU*/ + fallthrough; default: req->ki_complete(req, ret, 0); } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f2f9086ebe98..b9c658e0548e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra; + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm, vm_munmap(textpos, text_len); goto err; } - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", data_len + bss_len + stack_len, datapos); @@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra; + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm, } realdatastart = textpos + ntohl(hdr->data_start); - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(u32), + FLAT_DATA_ALIGN); reloc = (__be32 __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); @@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm, (text_len + full_data - sizeof(struct flat_hdr)), 0); - if (datapos != realdatastart) - memmove((void *)datapos, (void *)realdatastart, - full_data); + memmove((void *) datapos, (void *) realdatastart, + full_data); #else /* * This is used on MMU systems mainly for testing. @@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm, if (IS_ERR_VALUE(result)) { ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); - vm_munmap(textpos, text_len + data_len + extra); + vm_munmap(textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(u32)); goto err; } } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 613920c17ac1..ea8aaf36647e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1798,7 +1798,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); - set_free_space_tree_thresholds(cache); cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; @@ -1912,6 +1911,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (ret < 0) goto error; + set_free_space_tree_thresholds(cache); + if (need_clear) { /* * When we mount with old space cache, we need to @@ -2132,6 +2133,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, return -ENOMEM; cache->length = size; + set_free_space_tree_thresholds(cache); cache->used = bytes_used; cache->flags = type; cache->last_byte_to_unpin = (u64)-1; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 70e49d8d4f6c..cd392da69b81 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type) btrfs_csums[csum_type].name; } -size_t __const btrfs_get_num_csums(void) +size_t __attribute_const__ btrfs_get_num_csums(void) { return ARRAY_SIZE(btrfs_csums); } @@ -1297,6 +1297,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1370,7 +1372,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); @@ -1378,6 +1379,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); if (tm) __tree_mod_log_rewind(fs_info, eb, time_seq, tm); else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9c7e466f27a9..9a72896bed2e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2262,7 +2262,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); -size_t __const btrfs_get_num_csums(void); +size_t __attribute_const__ btrfs_get_num_csums(void); /* @@ -2518,7 +2518,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr); + u64 objectid, u64 offset, u64 bytenr, bool strict); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2934,7 +2934,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes); + u64 *ram_bytes, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9ae25f632157..f6bba7eb1fa1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4551,6 +4551,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) cache->io_ctl.inode = NULL; iput(inode); } + ASSERT(cache->io_ctl.pages == NULL); btrfs_put_block_group(cache); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index de6fe176fdfb..e9eedc053fc5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2306,7 +2306,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 objectid, u64 offset, u64 bytenr, + bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; @@ -2348,9 +2349,13 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; - /* If extent created before last snapshot => it's definitely shared */ - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) + /* + * If extent created before last snapshot => it's shared unless the + * snapshot has been deleted. Use the heuristic if strict is false. + */ + if (!strict && + (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2375,7 +2380,7 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr) + u64 bytenr, bool strict) { struct btrfs_path *path; int ret; @@ -2386,7 +2391,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, do { ret = check_committed_ref(root, path, objectid, - offset, bytenr); + offset, bytenr, strict); if (ret && ret != -ENOENT) goto out; @@ -4522,7 +4527,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ERR_PTR(-EUCLEAN); } - btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); + btrfs_set_buffer_lockdep_class(owner, buf, level); btrfs_tree_lock(buf); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6def411b2eba..a940edb1e64f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5655,9 +5655,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dstv, - unsigned long start, unsigned long len) +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5677,7 +5677,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); - if (copy_to_user(dst, kaddr + offset, cur)) { + if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 00a88f2eb5ab..30794ae58498 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -241,9 +241,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dst, unsigned long start, - unsigned long len); +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dst, unsigned long start, + unsigned long len); void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *src); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb824c7cb7c7..4507c3d09399 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1571,7 +1571,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL); + NULL, NULL, NULL, false); if (ret <= 0) { ret = 0; if (!nowait) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef0fd7afb0b1..dc82fd0c80cb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1186,7 +1186,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: - io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; @@ -1347,6 +1346,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * them out later */ io_ctl_drop_pages(io_ctl); + io_ctl_free(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 8b1f5c8897b7..6b9faf3b0e96 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache) size_t bitmap_size; u64 num_bitmaps, total_bitmap_size; + if (WARN_ON(cache->length == 0)) + btrfs_warn(cache->fs_info, "block group %llu length is zero", + cache->start); + /* * We convert to bitmaps when the disk space required for using extents * exceeds that required for using bitmaps. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 51fcd82d41c0..9570458aa847 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1610,7 +1610,7 @@ next_slot: goto out_check; ret = btrfs_cross_ref_exist(root, ino, found_key.offset - - extent_offset, disk_bytenr); + extent_offset, disk_bytenr, false); if (ret) { /* * ret could be -EIO if the above fails to read @@ -2161,11 +2161,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { struct inode *inode = private_data; - blk_status_t ret = 0; - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); - BUG_ON(ret); /* -ENOMEM */ - return 0; + return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } /* @@ -6953,6 +6950,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent + * @strict: if true, omit optimizations that might force us into unnecessary + * cow. e.g., don't trust generation number. * * This function will flush ordered extents in the range to ensure proper * nocow checks for (nowait == false) case. @@ -6967,7 +6966,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) + u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path; @@ -7045,8 +7044,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, * Do the same check as in btrfs_cross_ref_exist but without the * unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) + if (!strict && + (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; backref_offset = btrfs_file_extent_offset(leaf, fi); @@ -7082,7 +7082,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, */ ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr); + key.offset - backref_offset, disk_bytenr, + strict); if (ret) { ret = 0; goto out; @@ -7303,7 +7304,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1 && + &orig_block_len, &ram_bytes, false) == 1 && btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; @@ -7619,10 +7620,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { struct inode *inode = private_data; - blk_status_t ret; - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); - BUG_ON(ret); /* -ENOMEM */ - return 0; + + return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) @@ -10136,7 +10135,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, free_extent_map(em); em = NULL; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); if (ret < 0) { goto out; } else if (ret) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bd3511c5ca81..ac45f022b495 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2086,9 +2086,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, sh.len = item_len; sh.transid = found_transid; - /* copy search result header */ - if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = -EFAULT; + /* + * Copy search result header. If we fault then loop again so we + * can fault in the pages and -EFAULT there if there's a + * problem. Otherwise we'll fault and then copy the buffer in + * properly this next time through + */ + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = 0; goto out; } @@ -2096,10 +2101,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, if (item_len) { char __user *up = ubuf + *sk_offset; - /* copy the item */ - if (read_extent_buffer_to_user(leaf, up, - item_off, item_len)) { - ret = -EFAULT; + /* + * Copy the item, same behavior as above, but reset the + * * sk_offset so we copy the full thing again. + */ + if (read_extent_buffer_to_user_nofault(leaf, up, + item_off, item_len)) { + ret = 0; + *sk_offset -= sizeof(sh); goto out; } @@ -2184,6 +2193,10 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { + ret = fault_in_pages_writeable(ubuf, *buf_size - sk_offset); + if (ret) + break; + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5a6cb9db512e..354ab9985a34 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, return 0; } +static void scrub_workers_put(struct btrfs_fs_info *fs_info) +{ + if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, + &fs_info->scrub_lock)) { + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; + + scrub_workers = fs_info->scrub_workers; + scrub_wr_comp = fs_info->scrub_wr_completion_workers; + scrub_parity = fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; + fs_info->scrub_parity_workers = NULL; + mutex_unlock(&fs_info->scrub_lock); + + btrfs_destroy_workqueue(scrub_workers); + btrfs_destroy_workqueue(scrub_wr_comp); + btrfs_destroy_workqueue(scrub_parity); + } +} + /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; + int ret = -ENOMEM; - lockdep_assert_held(&fs_info->scrub_lock); + if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) + return 0; - if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL); - fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", - flags, is_dev_replace ? 1 : max_active, 4); - if (!fs_info->scrub_workers) - goto fail_scrub_workers; - - ASSERT(fs_info->scrub_wr_completion_workers == NULL); - fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); - if (!fs_info->scrub_wr_completion_workers) - goto fail_scrub_wr_completion_workers; + scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, + is_dev_replace ? 1 : max_active, 4); + if (!scrub_workers) + goto fail_scrub_workers; - ASSERT(fs_info->scrub_parity_workers == NULL); - fs_info->scrub_parity_workers = - btrfs_alloc_workqueue(fs_info, "scrubparity", flags, + scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_parity_workers) - goto fail_scrub_parity_workers; + if (!scrub_wr_comp) + goto fail_scrub_wr_completion_workers; + scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, + max_active, 2); + if (!scrub_parity) + goto fail_scrub_parity_workers; + + mutex_lock(&fs_info->scrub_lock); + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL && + fs_info->scrub_wr_completion_workers == NULL && + fs_info->scrub_parity_workers == NULL); + fs_info->scrub_workers = scrub_workers; + fs_info->scrub_wr_completion_workers = scrub_wr_comp; + fs_info->scrub_parity_workers = scrub_parity; refcount_set(&fs_info->scrub_workers_refcnt, 1); - } else { - refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + return 0; } - return 0; + /* Other thread raced in and created the workers for us */ + refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + ret = 0; + btrfs_destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(scrub_workers); fail_scrub_workers: - return -ENOMEM; + return ret; } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, @@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; if (btrfs_fs_closing(fs_info)) return -EAGAIN; @@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) + goto out_free_ctx; + mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -ENODEV; - goto out_free_ctx; + goto out; } if (!is_dev_replace && !readonly && @@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); ret = -EROFS; - goto out_free_ctx; + goto out; } mutex_lock(&fs_info->scrub_lock); @@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; - goto out_free_ctx; + goto out; } down_read(&fs_info->dev_replace.rwsem); @@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EINPROGRESS; - goto out_free_ctx; + goto out; } up_read(&fs_info->dev_replace.rwsem); - ret = scrub_workers_get(fs_info, is_dev_replace); - if (ret) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - goto out_free_ctx; - } - sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->scrub_lock); dev->scrub_ctx = NULL; - if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) { - scrub_workers = fs_info->scrub_workers; - scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; - - fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; - fs_info->scrub_parity_workers = NULL; - } mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + scrub_workers_put(fs_info); scrub_put_ctx(sctx); return ret; - +out: + scrub_workers_put(fs_info); out_free_ctx: scrub_free_ctx(sctx); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e529ddb35b87..25967ecaaf0a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -625,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } else if (strncmp(args[0].from, "lzo", 3) == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; + info->compress_level = 0; btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 517b44300a05..7b1fee630f97 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf, /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { inode_item_err(leaf, slot, - "invalid inode generation: has %llu expect [0, %llu]", + "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 696dd861cc3c..39da9db35278 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3449,11 +3449,13 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (ret == -ENOSPC) { + if (err == -ENOSPC) { btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0) - btrfs_abort_transaction(trans, ret); + err = 0; + } else if (err < 0 && err != -ENOENT) { + /* ENOENT can be returned if the entry hasn't been fsynced yet */ + btrfs_abort_transaction(trans, err); + } btrfs_end_log_trans(root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ee96c5869f57..214856c4ccb1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4462,6 +4462,7 @@ int btrfs_uuid_scan_kthread(void *data) goto skip; } update_tree: + btrfs_release_path(path); if (!btrfs_is_empty_uuid(root_item.uuid)) { ret = btrfs_uuid_tree_add(trans, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, @@ -4486,6 +4487,7 @@ update_tree: } skip: + btrfs_release_path(path); if (trans) { ret = btrfs_end_transaction(trans); trans = NULL; @@ -4493,7 +4495,6 @@ skip: break; } - btrfs_release_path(path); if (key.offset < (u64)-1) { key.offset++; } else if (key.type < BTRFS_ROOT_ITEM_KEY) { diff --git a/fs/buffer.c b/fs/buffer.c index 061dd202979d..50bbc99e3d96 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1958,7 +1958,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, */ set_buffer_new(bh); set_buffer_unwritten(bh); - /* FALLTHRU */ + fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || offset >= i_size_read(inode)) @@ -3157,6 +3157,15 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { + /* + * The bh should be mapped, but it might not be if the + * device was hot-removed. Not much we can do but fail the I/O. + */ + if (!buffer_mapped(bh)) { + unlock_buffer(bh); + return -EIO; + } + get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(REQ_OP_WRITE, op_flags, bh); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 55ccccf77cea..034b3f4fdd3a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int have = ci->i_snap_caps; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s" - " (mask %s)\n", ci->vfs_inode.i_ino, + dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s" - " (mask %s)\n", ci->vfs_inode.i_ino, cap, + dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s" - " (mask %s)\n", ci->vfs_inode.i_ino, + dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want, struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); - cw.ino = inode->i_ino; + cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; cw.want = want; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 97539b497e4c..3e3fcda9b276 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) { struct seq_file *s = p; - seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino, + seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode), ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented)); return 0; @@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p) spin_lock(&mdsc->caps_list_lock); list_for_each_entry(cw, &mdsc->cap_wait_list, list) { - seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino, + seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino, ceph_cap_string(cw->need), ceph_cap_string(cw->want)); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 060bdcc5ce32..d72e4a12bb69 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, - d_inode(dentry)->i_ino), + dentry->d_name.len, ceph_present_inode(d_inode(dentry)), d_inode(dentry)->i_mode >> 12)) { dput(dentry); err = 0; @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* always start with . and .. */ if (ctx->pos == 0) { dout("readdir off 0 -> '.'\n"); - if (!dir_emit(ctx, ".", 1, - ceph_translate_ino(inode->i_sb, inode->i_ino), + if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode), inode->i_mode >> 12)) return 0; ctx->pos = 1; } if (ctx->pos == 1) { - ino_t ino = parent_ino(file->f_path.dentry); + u64 ino; + struct dentry *dentry = file->f_path.dentry; + + spin_lock(&dentry->d_lock); + ino = ceph_present_inode(dentry->d_parent->d_inode); + spin_unlock(&dentry->d_lock); + dout("readdir off 1 -> '..'\n"); - if (!dir_emit(ctx, "..", 2, - ceph_translate_ino(inode->i_sb, ino), - inode->i_mode >> 12)) + if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12)) return 0; ctx->pos = 2; } @@ -507,9 +508,6 @@ more: } for (; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_vino vino; - ino_t ino; - u32 ftype; BUG_ON(rde->offset < ctx->pos); @@ -519,13 +517,10 @@ more: rde->name_len, rde->name, &rde->inode.in); BUG_ON(!rde->inode.in); - ftype = le32_to_cpu(rde->inode.in->mode) >> 12; - vino.ino = le64_to_cpu(rde->inode.in->ino); - vino.snap = le64_to_cpu(rde->inode.in->snapid); - ino = ceph_vino_to_ino(vino); if (!dir_emit(ctx, rde->name, rde->name_len, - ceph_translate_ino(inode->i_sb, ino), ftype)) { + ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), + le32_to_cpu(rde->inode.in->mode) >> 12)) { dout("filldir stopping us...\n"); return 0; } @@ -1161,7 +1156,7 @@ retry: if (try_async && op == CEPH_MDS_OP_UNLINK && (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { - dout("async unlink on %lu/%.*s caps=%s", dir->i_ino, + dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), dentry->d_name.len, dentry->d_name.name, ceph_cap_string(req->r_dir_caps)); set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); @@ -1745,7 +1740,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) case -ENOENT: if (d_really_is_negative(dentry)) valid = 1; - /* Fallthrough */ + fallthrough; default: break; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d51c3f2fdca0..3f4c993dfc6f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -252,7 +252,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFREG: ceph_fscache_register_inode_cookie(inode); ceph_fscache_file_set_cookie(inode, file); - /* fall through */ + fallthrough; case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, S_ISDIR(inode->i_mode)); @@ -630,8 +630,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, } else { struct dentry *dn; - dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, - vino.ino, dir->i_ino, dentry->d_name.name); + dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, + vino.ino, ceph_ino(dir), dentry->d_name.name); ceph_dir_clear_ordered(dir); ceph_init_inode_acls(inode, as_ctx); if (inode->i_state & I_NEW) { @@ -2507,6 +2507,7 @@ const struct file_operations ceph_file_fops = { .mmap = ceph_mmap, .fsync = ceph_fsync, .lock = ceph_lock, + .setlease = simple_nosetlease, .flock = ceph_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 357c937699d5..d163fa96cb40 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work); */ static int ceph_set_ino_cb(struct inode *inode, void *data) { - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + struct ceph_inode_info *ci = ceph_inode(inode); + + ci->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); inode_set_iversion_raw(inode, 0); return 0; } @@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; - ino_t t = ceph_vino_to_ino(vino); - inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); + inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, + ceph_set_ino_cb, &vino); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) - dout("get_inode created new inode %p %llx.%llx ino %llx\n", - inode, ceph_vinop(inode), (u64)inode->i_ino); - dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, - vino.snap, inode); + dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), + ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); return inode; } @@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat, } generic_fillattr(inode, stat); - stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + stat->ino = ceph_present_inode(inode); /* * btime on newly-allocated inodes is 0, so if this is still set to diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index bc9e95937d7c..658800605bfb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode { struct cap_wait { struct list_head list; - unsigned long ino; + u64 ino; pid_t tgid; int need; int want; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 198ddde5c1e6..cc2c4d40b022 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct super_block *sb = mdsc->fsc->sb; + struct inode *root = d_inode(sb->s_root); if (atomic64_read(&mdsc->quotarealms_count) > 0) return true; /* if root is the real CephFS root, we don't have quota realms */ - if (sb->s_root->d_inode && - (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT)) + if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; /* otherwise, we can't know for sure */ return true; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 4c3c964b1c54..a3995ebe0623 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode) return ceph_inode(inode)->i_vino; } -/* - * ino_t is <64 bits on many architectures, blech. - * - * i_ino (kernel inode) st_ino (userspace) - * i386 32 32 - * x86_64+ino32 64 32 - * x86_64 64 64 - */ -static inline u32 ceph_ino_to_ino32(__u64 vino) +static inline u32 ceph_ino_to_ino32(u64 vino) { u32 ino = vino & 0xffffffff; ino ^= vino >> 32; @@ -475,34 +467,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) } /* - * kernel i_ino value + * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on + * some arches. We generally do not use this value inside the ceph driver, but + * we do want to set it to something, so that generic vfs code has an + * appropriate value for tracepoints and the like. */ -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino) { -#if BITS_PER_LONG == 32 - return ceph_ino_to_ino32(vino.ino); -#else + if (sizeof(ino_t) == sizeof(u32)) + return ceph_ino_to_ino32(vino.ino); return (ino_t)vino.ino; -#endif -} - -/* - * user-visible ino (stat, filldir) - */ -#if BITS_PER_LONG == 32 -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) -{ - return ino; -} -#else -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) -{ - if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) - ino = ceph_ino_to_ino32(ino); - return ino; } -#endif - /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode) { return ceph_inode(inode)->i_vino.ino; } + static inline u64 ceph_snap(struct inode *inode) { return ceph_inode(inode)->i_vino.snap; } +/** + * ceph_present_ino - format an inode number for presentation to userland + * @sb: superblock where the inode lives + * @ino: inode number to (possibly) convert + * + * If the user mounted with the ino32 option, then the 64-bit value needs + * to be converted to something that can fit inside 32 bits. Note that + * internal kernel code never uses this value, so this is entirely for + * userland consumption. + */ +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) +{ + if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))) + return ceph_ino_to_ino32(ino); + return ino; +} + +static inline u64 ceph_present_inode(struct inode *inode) +{ + return ceph_present_ino(inode->i_sb, ceph_ino(inode)); +} + static inline int ceph_ino_compare(struct inode *inode, void *data) { struct ceph_vino *pvino = (struct ceph_vino *)data; @@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) ci->i_vino.snap == pvino->snap; } + static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { - ino_t t = ceph_vino_to_ino(vino); - return ilookup5(sb, t, ceph_ino_compare, &vino); + /* + * NB: The hashval will be run through the fs/inode.c hash function + * anyway, so there is no need to squash the inode number down to + * 32-bits first. Just use low-order bits on arches with 32-bit long. + */ + return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b296964b8afa..b565d83ba89e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -2031,4 +2031,19 @@ static inline bool is_smb1_server(struct TCP_Server_Info *server) return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0; } +static inline bool is_tcon_dfs(struct cifs_tcon *tcon) +{ + /* + * For SMB1, see MS-CIFS 2.4.55 SMB_COM_TREE_CONNECT_ANDX (0x75) and MS-CIFS 3.3.4.4 DFS + * Subsystem Notifies That a Share Is a DFS Share. + * + * For SMB2+, see MS-SMB2 2.2.10 SMB2 TREE_CONNECT Response and MS-SMB2 3.3.4.14 Server + * Application Updates a Share. + */ + if (!tcon || !tcon->ses || !tcon->ses->server) + return false; + return is_smb1_server(tcon->ses->server) ? tcon->Flags & SMB_SHARE_IS_IN_DFS : + tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT); +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0e763d2dcf16..0496934feecb 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -581,7 +581,7 @@ should_set_ext_sec_flag(enum securityEnum sectype) if (global_secflags & (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)) return true; - /* Fallthrough */ + fallthrough; default: return false; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a275ee399dce..a5731dd6e656 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1378,25 +1378,25 @@ static int cifs_parse_security_flavors(char *value, return 1; case Opt_sec_krb5i: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_sec_krb5: vol->sectype = Kerberos; break; case Opt_sec_ntlmsspi: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_sec_ntlmssp: vol->sectype = RawNTLMSSP; break; case Opt_sec_ntlmi: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_ntlm: vol->sectype = NTLM; break; case Opt_sec_ntlmv2i: vol->sign = true; - /* Fallthrough */ + fallthrough; case Opt_sec_ntlmv2: vol->sectype = NTLMv2; break; @@ -2187,7 +2187,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->password = NULL; break; } - /* Fallthrough - to Opt_pass below.*/ + fallthrough; /* to Opt_pass below */ case Opt_pass: /* Obtain the value string */ value = strchr(data, '='); @@ -4909,7 +4909,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) if (!tcon) continue; /* Make sure that requests go through new root servers */ - if (tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT)) { + if (is_tcon_dfs(tcon)) { put_root_ses(root_ses); set_root_ses(cifs_sb, ses, &root_ses); } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 69cd5856621b..de564368a887 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -798,7 +798,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if ((server->sec_kerberos || server->sec_mskerberos) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; - /* Fallthrough */ + fallthrough; default: return Unspecified; } @@ -815,7 +815,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) default: break; } - /* Fallthrough - to attempt LANMAN authentication next */ + fallthrough; /* to attempt LANMAN authentication next */ case CIFS_NEGFLAVOR_LANMAN: switch (requested) { case LANMAN: @@ -823,7 +823,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) case Unspecified: if (global_secflags & CIFSSEC_MAY_LANMAN) return LANMAN; - /* Fallthrough */ + fallthrough; default: return Unspecified; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 667d70aa335f..96c172d94fba 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1101,7 +1101,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if ((server->sec_kerberos || server->sec_mskerberos) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; - /* Fallthrough */ + fallthrough; default: return Unspecified; } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cb733652ecca..ca2273727225 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1688,11 +1688,11 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case 1: offset += file->f_pos; - /* fall through */ + fallthrough; case 0: if (offset >= 0) break; - /* fall through */ + fallthrough; default: return -EINVAL; } @@ -1367,7 +1367,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret = dax_load_hole(&xas, mapping, &entry, vmf); goto finish_iomap; } - /*FALLTHRU*/ + fallthrough; default: WARN_ON_ONCE(1); error = -EIO; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 18d81599522f..002123efc6b0 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5817,7 +5817,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, break; case -EAGAIN: error = 0; - /* fall through */ + fallthrough; default: __put_lkb(ls, lkb); goto out; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7d40d78ea864..ae325541884e 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -359,7 +359,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: map->m_flags &= ~EROFS_MAP_ZIPPED; - /* fallthrough */ + fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: map->m_la = (lcn << lclusterbits) | m->clusterofs; break; @@ -416,7 +416,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: if (endoff >= m.clusterofs) map->m_flags &= ~EROFS_MAP_ZIPPED; - /* fallthrough */ + fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: if (endoff >= m.clusterofs) { map->m_la = (m.lcn << lclusterbits) | m.clusterofs; @@ -433,7 +433,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, end = (m.lcn << lclusterbits) | m.clusterofs; map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; - /* fallthrough */ + fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: /* get the correspoinding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 12eebcdea9c8..8107e06d7f6f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1994,9 +1994,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) * not already there, and calling reverse_path_check() * during ep_insert(). */ - if (list_empty(&epi->ffd.file->f_tfile_llink)) - list_add(&epi->ffd.file->f_tfile_llink, - &tfile_check_list); + if (list_empty(&epi->ffd.file->f_tfile_llink)) { + if (get_file_rcu(epi->ffd.file)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); + } } } mutex_unlock(&ep->mtx); @@ -2040,6 +2042,7 @@ static void clear_tfile_check_list(void) file = list_first_entry(&tfile_check_list, struct file, f_tfile_llink); list_del_init(&file->f_tfile_llink); + fput(file); } INIT_LIST_HEAD(&tfile_check_list); } @@ -2200,25 +2203,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) { - clear_tfile_check_list(); + if (ep_loop_check(ep, tf.file) != 0) goto error_tgt_fput; - } - } else + } else { + get_file(tf.file); list_add(&tf.file->f_tfile_llink, &tfile_check_list); + } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); - if (error) { -out_del: - list_del(&tf.file->f_tfile_llink); + if (error) goto error_tgt_fput; - } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = epoll_mutex_lock(&tep->mtx, 1, nonblock); if (error) { mutex_unlock(&ep->mtx); - goto out_del; + goto error_tgt_fput; } } } @@ -2239,8 +2239,6 @@ out_del: error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; - if (full_check) - clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -2263,8 +2261,10 @@ out_del: mutex_unlock(&ep->mtx); error_tgt_fput: - if (full_check) + if (full_check) { + clear_tfile_check_list(); mutex_unlock(&epmutex); + } fdput(tf); error_fput: diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 60378ddf1424..96044f5dbc0e 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -93,8 +93,10 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); - if (vmf->flags & FAULT_FLAG_WRITE) { + if (write) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } @@ -103,7 +105,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); - if (vmf->flags & FAULT_FLAG_WRITE) + if (write) sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 80662e1f7889..415c21f0e750 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1241,7 +1241,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 1); } - /* fall through */ + fallthrough; case EXT2_IND_BLOCK: nr = i_data[EXT2_DIND_BLOCK]; if (nr) { @@ -1249,7 +1249,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 2); } - /* fall through */ + fallthrough; case EXT2_DIND_BLOCK: nr = i_data[EXT2_TIND_BLOCK]; if (nr) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index dda860562ca3..7fab2b3b5b39 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -587,7 +587,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_xip: ext2_msg(sb, KERN_INFO, "use dax instead of xip"); set_opt(opts->s_mount_opt, XIP); - /* Fall through */ + fallthrough; case Opt_dax: #ifdef CONFIG_FS_DAX ext2_msg(sb, KERN_WARNING, diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 1afa5a4bcb5f..619dd35ddd48 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -110,7 +110,7 @@ config EXT4_KUNIT_TESTS This builds the ext4 KUnit tests. KUnit tests run during boot and output the results to the debug log - in TAP format (http://testanything.org/). Only useful for kernel devs + in TAP format (https://testanything.org/). Only useful for kernel devs running KUnit test harness and are not for inclusion into a production build. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ba46d87cdf1..48c3df47748d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -413,7 +413,8 @@ verified: * Return buffer_head on success or an ERR_PTR in case of failure. */ struct buffer_head * -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -441,6 +442,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) return ERR_PTR(-ENOMEM); } + if (ignore_locked && buffer_locked(bh)) { + /* buffer under IO already, return if called for prefetching */ + put_bh(bh); + return NULL; + } + if (bitmap_uptodate(bh)) goto verify; @@ -487,10 +494,11 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ set_buffer_new(bh); - trace_ext4_read_block_bitmap_load(sb, block_group); + trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO | + (ignore_locked ? REQ_RAHEAD : 0), bh); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh; int err; - bh = ext4_read_block_bitmap_nowait(sb, block_group); + bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 16e9b2fda03a..c54ba52f2dd4 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -24,6 +24,7 @@ struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; + u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; @@ -45,7 +46,8 @@ void ext4_exit_system_zone(void) static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { - if ((entry1->start_blk + entry1->count) == entry2->start_blk) + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) return 1; return 0; } @@ -66,9 +68,9 @@ static void release_system_zone(struct ext4_system_blocks *system_blks) */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, u32 ino) { - struct ext4_system_zone *new_entry = NULL, *entry; + struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; @@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; - else { - if (start_blk + count > (entry->start_blk + - entry->count)) - entry->count = (start_blk + count - - entry->start_blk); - new_node = *n; - new_entry = rb_entry(new_node, struct ext4_system_zone, - node); - break; - } + else /* Unexpected overlap of system zones. */ + return -EFSCORRUPTED; } - if (!new_entry) { - new_entry = kmem_cache_alloc(ext4_system_zone_cachep, - GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->start_blk = start_blk; - new_entry->count = count; - new_node = &new_entry->node; - - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &system_blks->root); - } + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_entry->ino = ino; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); @@ -151,40 +144,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); } -/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. - */ -static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, - struct ext4_system_blocks *system_blks, - ext4_fsblk_t start_blk, - unsigned int count) -{ - struct ext4_system_zone *entry; - struct rb_node *n; - - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) - return 0; - - if (system_blks == NULL) - return 1; - - n = system_blks->root.rb_node; - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else - return 0; - } - return 1; -} - static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) @@ -214,19 +173,18 @@ static int ext4_protect_reserved_inode(struct super_block *sb, if (n == 0) { i++; } else { - if (!ext4_data_block_valid_rcu(sbi, system_blks, - map.m_pblk, n)) { - err = -EFSCORRUPTED; - __ext4_error(sb, __func__, __LINE__, -err, - map.m_pblk, "blocks %llu-%llu " - "from inode %u overlap system zone", - map.m_pblk, - map.m_pblk + map.m_len - 1, ino); + err = add_system_zone(system_blks, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + __ext4_error(sb, __func__, __LINE__, + -err, map.m_pblk, + "blocks %llu-%llu from inode %u overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1, + ino); + } break; } - err = add_system_zone(system_blks, map.m_pblk, n); - if (err < 0) - break; i += n; } } @@ -262,14 +220,6 @@ int ext4_setup_system_zone(struct super_block *sb) int flex_size = ext4_flex_bg_size(sbi); int ret; - if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks) - ext4_release_system_zone(sb); - return 0; - } - if (sbi->system_blks) - return 0; - system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); if (!system_blks) return -ENOMEM; @@ -277,22 +227,25 @@ int ext4_setup_system_zone(struct super_block *sb) for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(system_blks, + ((i < 5) || ((i % flex_size) == 0))) { + ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1); + ext4_bg_num_gdb(sb, i) + 1, 0); + if (ret) + goto err; + } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, - ext4_block_bitmap(sb, gdp), 1); + ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, - ext4_inode_bitmap(sb, gdp), 1); + ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), - sbi->s_itb_per_group); + sbi->s_itb_per_group, 0); if (ret) goto err; } @@ -341,11 +294,24 @@ void ext4_release_system_zone(struct super_block *sb) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } -int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with some other filesystem metadata blocks. + */ +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_system_blocks *system_blks; - int ret; + struct ext4_system_zone *entry; + struct rb_node *n; + int ret = 1; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; /* * Lock the system zone to prevent it being released concurrently @@ -354,8 +320,22 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, */ rcu_read_lock(); system_blks = rcu_dereference(sbi->system_blks); - ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, - count); + if (system_blks == NULL) + goto out_rcu; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + ret = (entry->ino == inode->i_ino); + break; + } + } +out_rcu: rcu_read_unlock(); return ret; } @@ -374,8 +354,7 @@ int ext4_check_blockref(const char *function, unsigned int line, while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 42f5060f3cdf..523e00d7b392 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -434,10 +434,36 @@ struct flex_groups { #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ - -/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +/* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ + EXT4_UNRM_FL | \ + EXT4_COMPR_FL | \ + EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_JOURNAL_DATA_FL | \ + EXT4_NOTAIL_FL | \ + EXT4_DIRSYNC_FL | \ + EXT4_TOPDIR_FL | \ + EXT4_EXTENTS_FL | \ + 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ + EXT4_DAX_FL | \ + EXT4_PROJINHERIT_FL | \ + EXT4_CASEFOLD_FL) + +/* User visible flags */ +#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ + EXT4_DIRTY_FL | \ + EXT4_COMPRBLK_FL | \ + EXT4_NOCOMPR_FL | \ + EXT4_ENCRYPT_FL | \ + EXT4_INDEX_FL | \ + EXT4_VERITY_FL | \ + EXT4_INLINE_DATA_FL) + +/* Flags we can manipulate with through FS_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -669,8 +695,6 @@ enum { /* * ioctl commands */ -#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS #define EXT4_IOC_GETVERSION _IOR('f', 3, long) #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION @@ -687,17 +711,11 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY -#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT -#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) -#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR - #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* @@ -722,8 +740,6 @@ enum { /* * ioctl commands in 32 bit emulation */ -#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) @@ -1054,6 +1070,7 @@ struct ext4_inode_info { struct timespec64 i_crtime; /* mballoc */ + atomic_t i_prealloc_active; struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; @@ -1172,6 +1189,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1501,10 +1519,13 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1572,6 +1593,8 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + atomic_t s_warning_count; + atomic_t s_msg_count; /* Encryption context for '-o test_dummy_encryption' */ struct fscrypt_dummy_context s_dummy_enc_ctx; @@ -1585,6 +1608,9 @@ struct ext4_sb_info { #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif + /* Record the errseq of the backing block device */ + errseq_t s_bdev_wb_err; + spinlock_t s_bdev_wb_lock; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2313,9 +2339,15 @@ struct ext4_lazy_init { struct mutex li_list_mtx; }; +enum ext4_li_mode { + EXT4_LI_MODE_PREFETCH_BBITMAP, + EXT4_LI_MODE_ITABLE, +}; + struct ext4_li_request { struct super_block *lr_super; - struct ext4_sb_info *lr_sbi; + enum ext4_li_mode lr_mode; + ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; @@ -2446,7 +2478,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, - ext4_group_t block_group); + ext4_group_t block_group, + bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); @@ -2651,9 +2684,15 @@ extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_discard_preallocations(struct inode *); +extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); +extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, + ext4_group_t group, + unsigned int nr, int *cnt); +extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr); + extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); @@ -2765,8 +2804,7 @@ extern int ext4_search_dir(struct buffer_head *bh, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); -extern int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, +extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, @@ -2924,12 +2962,6 @@ do { \ #endif -extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, - __u32 compat); -extern int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat); -extern int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat); extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, @@ -3145,6 +3177,7 @@ struct ext4_group_info { (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) @@ -3159,6 +3192,8 @@ struct ext4_group_info { (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 @@ -3363,9 +3398,9 @@ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0c76cdd44d90..760b9ee49dc0 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -195,6 +195,28 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line, jbd2_journal_abort_handle(handle); } +static void ext4_check_bdev_write_error(struct super_block *sb) +{ + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * If the block device has write error flag, it may have failed to + * async write out metadata buffers in the background. In this case, + * we could read old data from disk and write it out again, which + * may lead to on-disk filesystem inconsistency. + */ + if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { + spin_lock(&sbi->s_bdev_wb_lock); + err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); + spin_unlock(&sbi->s_bdev_wb_lock); + if (err) + ext4_error_err(sb, -err, + "Error while async write back metadata"); + } +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { @@ -202,6 +224,9 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); + if (bh->b_bdev->bd_super) + ext4_check_bdev_write_error(bh->b_bdev->bd_super); + if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 221f240eae60..a0481582187a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_mutex. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -340,7 +340,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) */ if (lblock + len <= lblock) return 0; - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); + return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, @@ -348,7 +348,7 @@ static int ext4_valid_extent_idx(struct inode *inode, { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); + return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, @@ -507,14 +507,10 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - if (!ext4_has_feature_journal(inode->i_sb) || - (inode->i_ino != - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) { - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); - if (err) - goto errout; - } + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries @@ -693,10 +689,8 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) return; depth = path->p_depth; for (i = 0; i <= depth; i++, path++) { - if (path->p_bh) { - brelse(path->p_bh); - path->p_bh = NULL; - } + brelse(path->p_bh); + path->p_bh = NULL; } } @@ -1915,7 +1909,7 @@ out: /* * ext4_ext_insert_extent: - * tries to merge requsted extent into the existing extent or + * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ @@ -3125,7 +3119,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states - * of which are deterimined by split_flag. + * of which are determined by split_flag. * * There are two cases: * a> the extent are splitted into two extent. @@ -3650,7 +3644,7 @@ static int ext4_split_convert_extents(handle_t *handle, eof_block = map->m_lblk + map->m_len; /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ depth = ext_depth(inode); ex = path[depth].p_ext; @@ -4272,7 +4266,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -4495,7 +4489,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Round up offset. This is not fallocate, we neet to zero out + * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the * range. @@ -5299,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ret = ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); @@ -5313,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); @@ -5445,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { @@ -5579,7 +5573,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; - /* Do we have somthing to swap ? */ + /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) goto finish; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 129cc1dd6b79..7d61069531d3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -145,10 +145,9 @@ static int ext4_release_file(struct inode *inode, struct file *filp) /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && - !EXT4_I(inode)->i_reserved_data_blocks) - { + !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -428,6 +427,10 @@ restart: */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || !ext4_overwrite_io(inode, offset, count))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); @@ -812,7 +815,7 @@ out: return err; } -static int ext4_file_open(struct inode * inode, struct file * filp) +static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 3e133793a5a3..2924261226e0 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -233,7 +233,7 @@ static int __ext4fs_dirhash(const char *name, int len, break; case DX_HASH_HALF_MD4_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; - /* fall through */ + fallthrough; case DX_HASH_HALF_MD4: p = name; while (len > 0) { @@ -247,7 +247,7 @@ static int __ext4fs_dirhash(const char *name, int len, break; case DX_HASH_TEA_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; - /* fall through */ + fallthrough; case DX_HASH_TEA: p = name; while (len > 0) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index be2b66eb65f7..80c9f33800be 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_mutex. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -858,8 +858,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { + if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); @@ -1004,8 +1003,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { + if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", @@ -1182,21 +1180,21 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_TIND_BLOCK: ; } @@ -1436,7 +1434,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; @@ -1445,7 +1443,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; @@ -1454,7 +1452,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } - /* fall through */ + fallthrough; case EXT4_TIND_BLOCK: ; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c3a1ad2db122..75c97bca0815 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -276,7 +276,7 @@ static int ext4_create_inline_data(handle_t *handle, len = 0; } - /* Insert the the xttr entry. */ + /* Insert the xttr entry. */ i.value = value; i.value_len = len; @@ -1706,7 +1706,7 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) goto out; - err = ext4_generic_delete_entry(handle, dir, de_del, bh, + err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 44bad4bb8831..bf596467c234 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); } static int __check_block_validity(struct inode *inode, const char *func, @@ -394,8 +394,7 @@ static int __check_block_validity(struct inode *inode, const char *func, (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, - map->m_len)) { + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, @@ -3288,7 +3287,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) if (PageChecked(page)) return 0; if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page); else return try_to_free_buffers(page); } @@ -4056,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (stop_block > first_block) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ret = ext4_es_remove_extent(inode, first_block, stop_block - first_block); @@ -4163,7 +4162,7 @@ int ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return 0; + goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); @@ -4172,16 +4171,14 @@ int ext4_truncate(struct inode *inode) int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); - if (err) - return err; - if (has_inline) - return 0; + if (err || has_inline) + goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return 0; + goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4190,8 +4187,10 @@ int ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_trace; + } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4211,7 +4210,7 @@ int ext4_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); @@ -4242,6 +4241,7 @@ out_stop: err = err2; ext4_journal_stop(handle); +out_trace: trace_ext4_truncate_exit(inode); return err; } @@ -4760,7 +4760,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = 0; if (ei->i_file_acl && - !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); @@ -4901,7 +4901,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, (inode->i_state & I_DIRTY_TIME)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 999cf6add39c..36eca3bc036a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb, reset_inode_seed(inode); reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { @@ -819,12 +819,12 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); - case EXT4_IOC_GETFLAGS: + case FS_IOC_GETFLAGS: flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (S_ISREG(inode->i_mode)) flags &= ~EXT4_PROJINHERIT_FL; return put_user(flags, (int __user *) arg); - case EXT4_IOC_SETFLAGS: { + case FS_IOC_SETFLAGS: { int err; if (!inode_owner_or_capable(inode)) @@ -1129,12 +1129,12 @@ resizefs_out: case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - case EXT4_IOC_GET_ENCRYPTION_PWSALT: { + case FS_IOC_GET_ENCRYPTION_PWSALT: { #ifdef CONFIG_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1174,7 +1174,7 @@ resizefs_out: return -EOPNOTSUPP; #endif } - case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); @@ -1236,7 +1236,7 @@ resizefs_out: case EXT4_IOC_GET_ES_CACHE: return ext4_ioctl_get_es_cache(filp, arg); - case EXT4_IOC_FSGETXATTR: + case FS_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1247,7 +1247,7 @@ resizefs_out: return -EFAULT; return 0; } - case EXT4_IOC_FSSETXATTR: + case FS_IOC_FSSETXATTR: { struct fsxattr fa, old_fa; int err; @@ -1313,11 +1313,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case EXT4_IOC32_GETFLAGS: - cmd = EXT4_IOC_GETFLAGS; + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; break; - case EXT4_IOC32_SETFLAGS: - cmd = EXT4_IOC_SETFLAGS; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; break; case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; @@ -1361,9 +1361,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_RESIZE_FS: case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: - case EXT4_IOC_SET_ENCRYPTION_POLICY: - case EXT4_IOC_GET_ENCRYPTION_PWSALT: - case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: @@ -1377,8 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: - case EXT4_IOC_FSGETXATTR: - case EXT4_IOC_FSSETXATTR: + case FS_IOC_FSGETXATTR: + case FS_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c0a331e2feb0..132c118d12e1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) bh[i] = NULL; continue; } - bh[i] = ext4_read_block_bitmap_nowait(sb, group); + bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; @@ -1279,9 +1279,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); - return 0; err: @@ -1743,10 +1740,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, } -/* - * regular allocator, for general purposes allocation - */ - static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) @@ -2119,13 +2112,11 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); - free = grp->bb_free; - if (free == 0) - return false; - if (cr <= 2 && free < ac->ac_g_ex.fe_len) + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + free = grp->bb_free; + if (free == 0) return false; fragments = grp->bb_fragments; @@ -2142,8 +2133,10 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ((group % flex_size) == 0)) return false; - if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || - (free / fragments) >= ac->ac_g_ex.fe_len) + if (free < ac->ac_g_ex.fe_len) + return false; + + if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) return true; if (grp->bb_largest_free_order < ac->ac_2order) @@ -2177,6 +2170,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; @@ -2195,7 +2189,25 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, group, NULL); + int ret; + + /* cr=0/1 is a very optimistic search to find large + * good chunks almost for free. If buddy data is not + * ready, then this optimization makes no sense. But + * we never skip the first block group in a flex_bg, + * since this gets used for metadata block allocation, + * and we want to make sure we locate metadata blocks + * in the first block group in the flex_bg if possible. + */ + if (cr < 2 && + (!sbi->s_log_groups_per_flex || + ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) + return 0; + ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } @@ -2209,15 +2221,95 @@ out: return ret; } +/* + * Start prefetching @nr block bitmaps starting at @group. + * Return the next group which needs to be prefetched. + */ +ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, + unsigned int nr, int *cnt) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct buffer_head *bh; + struct blk_plug plug; + + blk_start_plug(&plug); + while (nr-- > 0) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, + NULL); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + /* + * Prefetch block groups with free blocks; but don't + * bother if it is marked uninitialized on disk, since + * it won't require I/O to read. Also only try to + * prefetch once, so we avoid getblk() call, which can + * be expensive. + */ + if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + bh = ext4_read_block_bitmap_nowait(sb, group, true); + if (bh && !IS_ERR(bh)) { + if (!buffer_uptodate(bh) && cnt) + (*cnt)++; + brelse(bh); + } + } + if (++group >= ngroups) + group = 0; + } + blk_finish_plug(&plug); + return group; +} + +/* + * Prefetching reads the block bitmap into the buffer cache; but we + * need to make sure that the buddy bitmap in the page cache has been + * initialized. Note that ext4_mb_init_group() will block if the I/O + * is not yet completed, or indeed if it was not initiated by + * ext4_mb_prefetch did not start the I/O. + * + * TODO: We should actually kick off the buddy bitmap setup in a work + * queue when the buffer I/O is completed, so that we don't block + * waiting for the block allocation bitmap read to finish when + * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). + */ +void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr) +{ + while (nr-- > 0) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, + NULL); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + if (!group) + group = ext4_get_groups_count(sb); + group--; + grp = ext4_get_group_info(sb, group); + + if (EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } + } +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t ngroups, group, i; + ext4_group_t prefetch_grp = 0, ngroups, group, i; int cr = -1; int err = 0, first_err = 0; + unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; + int lost; sb = ac->ac_sb; sbi = EXT4_SB(sb); @@ -2237,8 +2329,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) goto out; /* - * ac->ac2_order is set only if the fe_len is a power of 2 - * if ac2_order is set we also set criteria to 0 so that we + * ac->ac_2order is set only if the fe_len is a power of 2 + * if ac->ac_2order is set we also set criteria to 0 so that we * try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); @@ -2282,6 +2374,7 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; + prefetch_grp = group; for (i = 0; i < ngroups; group++, i++) { int ret = 0; @@ -2293,6 +2386,29 @@ repeat: if (group >= ngroups) group = 0; + /* + * Batch reads of the block allocation bitmaps + * to get multiple READs in flight; limit + * prefetching at cr=0/1, otherwise mballoc can + * spend a lot of time loading imperfect groups + */ + if ((prefetch_grp == group) && + (cr > 1 || + prefetch_ios < sbi->s_mb_prefetch_limit)) { + unsigned int curr_ios = prefetch_ios; + + nr = sbi->s_mb_prefetch; + if (ext4_has_feature_flex_bg(sb)) { + nr = (group / sbi->s_mb_prefetch) * + sbi->s_mb_prefetch; + nr = nr + sbi->s_mb_prefetch - group; + } + prefetch_grp = ext4_mb_prefetch(sb, group, + nr, &prefetch_ios); + if (prefetch_ios == curr_ios) + nr = 0; + } + /* This now checks without needing the buddy page */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { @@ -2341,22 +2457,24 @@ repeat: * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ - ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) - printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); */ + lost = atomic_inc_return(&sbi->s_mb_lost_chunks); + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, lost); + ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; cr = 3; - atomic_inc(&sbi->s_mb_lost_chunks); goto repeat; } } @@ -2367,6 +2485,10 @@ out: mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, cr, err); + + if (nr) + ext4_mb_prefetch_fini(sb, prefetch_grp, nr); + return err; } @@ -2439,7 +2561,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg.info.bb_counters[i] : 0); - seq_printf(seq, " ]\n"); + seq_puts(seq, " ]\n"); return 0; } @@ -2613,6 +2735,26 @@ static int ext4_mb_init_backend(struct super_block *sb) goto err_freebuddy; } + if (ext4_has_feature_flex_bg(sb)) { + /* a single flex group is supposed to be read by a single IO */ + sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; + sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ + } else { + sbi->s_mb_prefetch = 32; + } + if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch = ext4_get_groups_count(sb); + /* now many real IOs to prefetch within a single allocation at cr=0 + * given cr=0 is an CPU-related optimization we shouldn't try to + * load too many groups, at some point we should start to use what + * we've got in memory. + * with an average random access time 5ms, it'd take a second to get + * 200 groups (* N with flex_bg), so let's make this limit 4 + */ + sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; + if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); + return 0; err_freebuddy: @@ -2736,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -3090,7 +3233,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (!ext4_data_block_valid(sbi, block, len)) { + if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error @@ -3674,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } +static void ext4_mb_mark_pa_deleted(struct super_block *sb, + struct ext4_prealloc_space *pa) +{ + struct ext4_inode_info *ei; + + if (pa->pa_deleted) { + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", + pa->pa_type, pa->pa_pstart, pa->pa_lstart, + pa->pa_len); + return; + } + + pa->pa_deleted = 1; + + if (pa->pa_type == MB_INODE_PA) { + ei = EXT4_I(pa->pa_inode); + atomic_dec(&ei->i_prealloc_active); + } +} + static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; @@ -3706,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, return; } - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; @@ -3830,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); + atomic_inc(&ei->i_prealloc_active); } /* @@ -4040,7 +4204,7 @@ repeat: } /* seems this one can be freed ... */ - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); /* we can trust pa_free ... */ free += pa->pa_free; @@ -4103,7 +4267,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode, unsigned int needed) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -4121,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); - trace_ext4_discard_preallocations(inode); + trace_ext4_discard_preallocations(inode, + atomic_read(&ei->i_prealloc_active), needed); INIT_LIST_HEAD(&list); + if (needed == 0) + needed = UINT_MAX; + repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list)) { - pa = list_entry(ei->i_prealloc_list.next, + while (!list_empty(&ei->i_prealloc_list) && needed) { + pa = list_entry(ei->i_prealloc_list.prev, struct ext4_prealloc_space, pa_inode_list); BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); @@ -4146,10 +4314,11 @@ repeat: } if (pa->pa_deleted == 0) { - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); list_add(&pa->u.pa_tmp_list, &list); + needed--; continue; } @@ -4399,7 +4568,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; - /* we have to define context: we'll we work with a file or + /* we have to define context: we'll work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); @@ -4450,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); @@ -4549,10 +4718,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } /* + * if per-inode prealloc list is too long, trim some PA + */ +static void ext4_mb_trim_inode_pa(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int count, delta; + + count = atomic_read(&ei->i_prealloc_active); + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; + if (count > sbi->s_mb_max_inode_prealloc + delta) { + count -= sbi->s_mb_max_inode_prealloc; + ext4_discard_preallocations(inode, count); + } +} + +/* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct inode *inode = ac->ac_inode; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -4564,21 +4752,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); + + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if (likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } } - } - if (pa) { - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. - */ - if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + + if (pa->pa_type == MB_INODE_PA) { + /* + * treat per-inode prealloc list as a lru list, then try + * to trim the least recently used PA. + */ spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); + list_move(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); } + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) @@ -4588,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); + ext4_mb_trim_inode_pa(inode); return 0; } @@ -4915,7 +5114,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_data_block_valid(sbi, block, count)) { + !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6b4d17c2935d..e75b4749aa1c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -73,6 +73,10 @@ */ #define MB_DEFAULT_GROUP_PREALLOC 512 +/* + * maximum length of inode prealloc list + */ +#define MB_DEFAULT_MAX_INODE_PREALLOC 512 struct ext4_free_data { /* this links the free block information from sb_info */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 1ed86fb6c302..0d601b822875 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, out: if (*moved_len) { - ext4_discard_preallocations(orig_inode); - ext4_discard_preallocations(donor_inode); + ext4_discard_preallocations(orig_inode, 0); + ext4_discard_preallocations(donor_inode, 0); } ext4_ext_drop_refs(path); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 56738b538ddf..153a9fbe1dd0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1396,8 +1396,8 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, + buf_size, offset)) return -1; *res_dir = de; return 1; @@ -1858,7 +1858,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, blocksize, hinfo, map); map -= count; dx_sort_map(map, count); - /* Split the existing block in the middle, size-wise */ + /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { @@ -1868,8 +1868,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, size += map[i].size; move++; } - /* map index at which we will split */ - split = count - move; + /* + * map index at which we will split + * + * If the sum of active entries didn't exceed half the block size, just + * split it in half by count; each resulting block will have at least + * half the space free. + */ + if (i > 0) + split = count - move; + else + split = count/2; + hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", @@ -2455,8 +2465,7 @@ cleanup: * ext4_generic_delete_entry deletes a directory entry by merging it * with the previous entry */ -int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, +int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, @@ -2472,7 +2481,7 @@ int ext4_generic_delete_entry(handle_t *handle, de = (struct ext4_dir_entry_2 *)entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, i)) + entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) @@ -2517,8 +2526,7 @@ static int ext4_delete_entry(handle_t *handle, if (unlikely(err)) goto out; - err = ext4_generic_delete_entry(handle, dir, de_del, - bh, bh->b_data, + err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data, dir->i_sb->s_blocksize, csum_size); if (err) goto out; @@ -3193,30 +3201,33 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) * in separate transaction */ retval = dquot_initialize(dir); if (retval) - return retval; + goto out_trace; retval = dquot_initialize(d_inode(dentry)); if (retval) - return retval; + goto out_trace; - retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (!bh) - goto end_unlink; + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + goto out_trace; + } + if (!bh) { + retval = -ENOENT; + goto out_trace; + } inode = d_inode(dentry); - retval = -EFSCORRUPTED; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_unlink; + if (le32_to_cpu(de->inode) != inode->i_ino) { + retval = -EFSCORRUPTED; + goto out_bh; + } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - handle = NULL; - goto end_unlink; + goto out_bh; } if (IS_DIRSYNC(dir)) @@ -3224,12 +3235,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto end_unlink; + goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto end_unlink; + goto out_handle; if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", dentry->d_name.len, dentry->d_name.name); @@ -3251,10 +3262,11 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) d_invalidate(dentry); #endif -end_unlink: +out_handle: + ext4_journal_stop(handle); +out_bh: brelse(bh); - if (handle) - ext4_journal_stop(handle); +out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f2df2db0786c..f014c5e473a9 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -140,7 +140,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) return; } ctx->cur_step++; - /* fall-through */ + fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); @@ -148,7 +148,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) return; } ctx->cur_step++; - /* fall-through */ + fallthrough; default: __read_end_io(ctx->bio); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0907f907c47d..ea425b49b345 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -66,10 +66,10 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); -static void ext4_mark_recovery_complete(struct super_block *sb, +static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); -static void ext4_clear_journal_err(struct super_block *sb, - struct ext4_super_block *es); +static int ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -744,6 +744,7 @@ void __ext4_msg(struct super_block *sb, struct va_format vaf; va_list args; + atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; @@ -754,9 +755,12 @@ void __ext4_msg(struct super_block *sb, va_end(args); } -#define ext4_warning_ratelimit(sb) \ - ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ - "EXT4-fs warning") +static int ext4_warning_ratelimit(struct super_block *sb) +{ + atomic_inc(&EXT4_SB(sb)->s_warning_count); + return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning"); +} void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) @@ -1123,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); @@ -1216,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ -1288,8 +1293,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, if (!page_has_buffers(page)) return 0; if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_DIRECT_RECLAIM); + return jbd2_journal_try_to_free_buffers(journal, page); + return try_to_free_buffers(page); } @@ -1522,6 +1527,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, + Opt_prefetch_block_bitmaps, }; static const match_table_t tokens = { @@ -1614,6 +1620,7 @@ static const match_table_t tokens = { {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ + {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1831,6 +1838,8 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + MOPT_SET}, {Opt_err, 0, 0} }; @@ -3213,15 +3222,34 @@ static void print_daily_error_info(struct timer_list *t) static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; - ext4_group_t group, ngroups; - struct super_block *sb; + struct super_block *sb = elr->lr_super; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t group = elr->lr_next_group; unsigned long timeout = 0; + unsigned int prefetch_ios = 0; int ret = 0; - sb = elr->lr_super; - ngroups = EXT4_SB(sb)->s_groups_count; + if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { + elr->lr_next_group = ext4_mb_prefetch(sb, group, + EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); + if (prefetch_ios) + ext4_mb_prefetch_fini(sb, elr->lr_next_group, + prefetch_ios); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, + prefetch_ios); + if (group >= elr->lr_next_group) { + ret = 1; + if (elr->lr_first_not_zeroed != ngroups && + !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + elr->lr_next_group = elr->lr_first_not_zeroed; + elr->lr_mode = EXT4_LI_MODE_ITABLE; + ret = 0; + } + } + return ret; + } - for (group = elr->lr_next_group; group < ngroups; group++) { + for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; @@ -3239,9 +3267,10 @@ static int ext4_run_li_request(struct ext4_li_request *elr) timeout = jiffies; ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); + trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; + EXT4_SB(elr->lr_super)->s_li_wait_mult; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3256,15 +3285,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr) */ static void ext4_remove_li_request(struct ext4_li_request *elr) { - struct ext4_sb_info *sbi; - if (!elr) return; - sbi = elr->lr_sbi; - list_del(&elr->lr_request); - sbi->s_li_request = NULL; + EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } @@ -3473,7 +3498,6 @@ static int ext4_li_info_new(void) static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { - struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); @@ -3481,8 +3505,13 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, return NULL; elr->lr_super = sb; - elr->lr_sbi = sbi; - elr->lr_next_group = start; + elr->lr_first_not_zeroed = start; + if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; + else { + elr->lr_mode = EXT4_LI_MODE_ITABLE; + elr->lr_next_group = start; + } /* * Randomize first schedule time of the request to @@ -3512,8 +3541,9 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (first_not_zeroed == ngroups || sb_rdonly(sb) || - !test_opt(sb, INIT_INODE_TABLE)) + if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || sb_rdonly(sb) || + !test_opt(sb, INIT_INODE_TABLE))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); @@ -4710,11 +4740,13 @@ no_journal: ext4_set_resv_clusters(sb); - err = ext4_setup_system_zone(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)", err); - goto failed_mount4a; + if (test_opt(sb, BLOCK_VALIDITY)) { + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } } ext4_ext_init(sb); @@ -4777,12 +4809,23 @@ no_journal: } #endif /* CONFIG_QUOTA */ + /* + * Save the original bdev mapping's wb_err value which could be + * used to detect the metadata async write error. + */ + spin_lock_init(&sbi->s_bdev_wb_lock); + if (!sb_rdonly(sb)) + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); - ext4_mark_recovery_complete(sb, es); + err = ext4_mark_recovery_complete(sb, es); + if (err) + goto failed_mount8; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -4816,6 +4859,8 @@ no_journal: ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + atomic_set(&sbi->s_warning_count, 0); + atomic_set(&sbi->s_msg_count, 0); kfree(orig_data); return 0; @@ -4825,10 +4870,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; -#ifdef CONFIG_QUOTA failed_mount8: ext4_unregister_sysfs(sb); -#endif failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -4968,7 +5011,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return NULL; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (!journal_inode) @@ -4998,7 +5042,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return NULL; bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) @@ -5089,8 +5134,10 @@ static int ext4_load_journal(struct super_block *sb, dev_t journal_dev; int err = 0; int really_read_only; + int journal_dev_ro; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -5100,7 +5147,31 @@ static int ext4_load_journal(struct super_block *sb, } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); - really_read_only = bdev_read_only(sb->s_bdev); + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, + "filesystem has both journal inode and journal device!"); + return -EINVAL; + } + + if (journal_inum) { + journal = ext4_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + } else { + journal = ext4_get_dev_journal(sb, journal_dev); + if (!journal) + return -EINVAL; + } + + journal_dev_ro = bdev_read_only(journal->j_dev); + really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; + + if (journal_dev_ro && !sb_rdonly(sb)) { + ext4_msg(sb, KERN_ERR, + "journal device read-only, try mounting with '-o ro'"); + err = -EROFS; + goto err_out; + } /* * Are we loading a blank journal or performing recovery after a @@ -5115,27 +5186,14 @@ static int ext4_load_journal(struct super_block *sb, ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); - return -EROFS; + err = -EROFS; + goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } - if (journal_inum && journal_dev) { - ext4_msg(sb, KERN_ERR, "filesystem has both journal " - "and inode journals!"); - return -EINVAL; - } - - if (journal_inum) { - if (!(journal = ext4_get_journal(sb, journal_inum))) - return -EINVAL; - } else { - if (!(journal = ext4_get_dev_journal(sb, journal_dev))) - return -EINVAL; - } - if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); @@ -5155,12 +5213,16 @@ static int ext4_load_journal(struct super_block *sb, if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); - jbd2_journal_destroy(journal); - return err; + goto err_out; } EXT4_SB(sb)->s_journal = journal; - ext4_clear_journal_err(sb, es); + err = ext4_clear_journal_err(sb, es); + if (err) { + EXT4_SB(sb)->s_journal = NULL; + jbd2_journal_destroy(journal); + return err; + } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -5171,6 +5233,10 @@ static int ext4_load_journal(struct super_block *sb, } return 0; + +err_out: + jbd2_journal_destroy(journal); + return err; } static int ext4_commit_super(struct super_block *sb, int sync) @@ -5183,13 +5249,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) return error; /* - * The superblock bh should be mapped, but it might not be if the - * device was hot-removed. Not much we can do but fail the I/O. - */ - if (!buffer_mapped(sbh)) - return error; - - /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system @@ -5256,26 +5315,32 @@ static int ext4_commit_super(struct super_block *sb, int sync) * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ -static void ext4_mark_recovery_complete(struct super_block *sb, - struct ext4_super_block *es) +static int ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) { + int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { - BUG_ON(journal != NULL); - return; + if (journal != NULL) { + ext4_error(sb, "Journal got removed while the fs was " + "mounted!"); + return -EFSCORRUPTED; + } + return 0; } jbd2_journal_lock_updates(journal); - if (jbd2_journal_flush(journal) < 0) + err = jbd2_journal_flush(journal); + if (err < 0) goto out; if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } - out: jbd2_journal_unlock_updates(journal); + return err; } /* @@ -5283,14 +5348,17 @@ out: * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext4_clear_journal_err(struct super_block *sb, +static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; - BUG_ON(!ext4_has_feature_journal(sb)); + if (!ext4_has_feature_journal(sb)) { + ext4_error(sb, "Journal got removed while the fs was mounted!"); + return -EFSCORRUPTED; + } journal = EXT4_SB(sb)->s_journal; @@ -5315,6 +5383,7 @@ static void ext4_clear_journal_err(struct super_block *sb, jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } + return 0; } /* @@ -5457,7 +5526,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long old_sb_flags; + unsigned long old_sb_flags, vfs_flags; struct ext4_mount_options old_opts; int enable_quota = 0; ext4_group_t g; @@ -5500,6 +5569,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + /* + * Some options can be enabled by ext4 and/or by VFS mount flag + * either way we need to make sure it matches in both *flags and + * s_flags. Copy those selected flags from *flags to s_flags + */ + vfs_flags = SB_LAZYTIME | SB_I_VERSION; + sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; @@ -5553,9 +5630,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & SB_LAZYTIME) - sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; @@ -5585,8 +5659,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - if (sbi->s_journal) + if (sbi->s_journal) { + /* + * We let remount-ro finish even if marking fs + * as clean failed... + */ ext4_mark_recovery_complete(sb, es); + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); } else { @@ -5629,13 +5708,24 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } /* + * Update the original bdev mapping's wb_err value + * which could be used to detect the metadata async + * write error. + */ + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + + /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ - if (sbi->s_journal) - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) { + err = ext4_clear_journal_err(sb, es); + if (err) + goto restore_opts; + } sbi->s_mount_state = le16_to_cpu(es->s_state); err = ext4_setup_super(sb, es, 0); @@ -5665,7 +5755,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); } - ext4_setup_system_zone(sb); + /* + * Handle creation of system zone data early because it can fail. + * Releasing of existing data is done when we are sure remount will + * succeed. + */ + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) { + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + } + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb, 1); if (err) @@ -5686,8 +5786,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } #endif + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); + + /* + * Some options can be enabled by ext4 and/or by VFS mount flag + * either way we need to make sure it matches in both *flags and + * s_flags. Copy those selected flags from s_flags to *flags + */ + *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; @@ -5701,6 +5809,8 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6c9fc9e21c13..bfabb799fa45 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -189,6 +189,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) +#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname) + #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -215,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); @@ -226,6 +230,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif +EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count); +EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); @@ -240,6 +246,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); +EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -257,6 +265,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_inode_prealloc), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), @@ -267,6 +276,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(errors_count), + ATTR_LIST(warning_count), + ATTR_LIST(msg_count), ATTR_LIST(first_error_ino), ATTR_LIST(last_error_ino), ATTR_LIST(first_error_block), @@ -283,6 +294,8 @@ static struct attribute *ext4_attrs[] = { #ifdef CONFIG_EXT4_DEBUG ATTR_LIST(simulate_fail), #endif + ATTR_LIST(mb_prefetch), + ATTR_LIST(mb_prefetch_limit), NULL, }; ATTRIBUTE_GROUPS(ext4); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7d2f6576d954..cba4b877c606 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1356,8 +1356,7 @@ retry: block = 0; while (wsize < bufsize) { - if (bh != NULL) - brelse(bh); + brelse(bh); csize = (bufsize - wsize) > blocksize ? blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16322ea5b463..d9e52a7f3702 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2646,7 +2646,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_NEW_INODE: if (set) return; - /* fall through */ + fallthrough; case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9bbaa2614679..3ad7bdbda5ca 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -618,10 +618,10 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) switch (dn->max_level) { case 3: base += 2 * indirect_blks; - /* fall through */ + fallthrough; case 2: base += 2 * direct_blks; - /* fall through */ + fallthrough; case 1: base += direct_index; break; diff --git a/fs/fcntl.c b/fs/fcntl.c index 2e4c0fa2074b..19ac5baad50f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -362,7 +362,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_OFD_SETLK: case F_OFD_SETLKW: #endif - /* Fallthrough */ + fallthrough; case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) @@ -771,7 +771,7 @@ static void send_sigio_to_task(struct task_struct *p, if (!do_send_sig_info(signum, &si, p, type)) break; } - /* fall-through - fall back on the old plain SIGIO signal */ + fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a605c3dddabc..149227160ff0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,7 +42,6 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; - unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -144,7 +143,9 @@ static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -1122,7 +1123,9 @@ void inode_io_list_del(struct inode *inode) struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); inode_io_list_del_locked(inode, wb); + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } EXPORT_SYMBOL(inode_io_list_del); @@ -1172,8 +1175,10 @@ void sb_clear_inode_writeback(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { + assert_spin_locked(&inode->i_lock); + if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1182,6 +1187,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); + inode->i_state &= ~I_SYNC_QUEUED; +} + +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +{ + spin_lock(&inode->i_lock); + redirty_tail_locked(inode, wb); + spin_unlock(&inode->i_lock); } /* @@ -1220,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) #define EXPIRE_DIRTY_ATIME 0x0001 /* - * Move expired (dirtied before work->older_than_this) dirty inodes from + * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - int flags, - struct wb_writeback_work *work) + unsigned long dirtied_before) { - unsigned long *older_than_this = NULL; - unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -1237,21 +1247,15 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; - if ((flags & EXPIRE_DIRTY_ATIME) == 0) - older_than_this = work->older_than_this; - else if (!work->for_sync) { - expire_time = jiffies - (dirtytime_expire_interval * HZ); - older_than_this = &expire_time; - } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (inode_dirtied_after(inode, dirtied_before)) break; list_move(&inode->i_io_list, &tmp); moved++; - if (flags & EXPIRE_DIRTY_ATIME) - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); + spin_lock(&inode->i_lock); + inode->i_state |= I_SYNC_QUEUED; + spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -1289,18 +1293,22 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, + unsigned long dirtied_before) { int moved; + unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); + if (!work->for_sync) + time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, - EXPIRE_DIRTY_ATIME, work); + time_expire_jif); if (moved) wb_io_lists_populated(wb); - trace_writeback_queue_io(wb, work, moved); + trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -1394,7 +1402,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); return; } @@ -1414,7 +1422,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -1422,10 +1430,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * such as delayed allocation during submission or metadata * updates after data IO completion. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ inode_io_list_del_locked(inode, wb); @@ -1472,18 +1481,14 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - if (inode->i_state & I_DIRTY_TIME) { - if ((dirty & I_DIRTY_INODE) || - wbc->sync_mode == WB_SYNC_ALL || - unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || - unlikely(time_after(jiffies, - (inode->dirtied_time_when + - dirtytime_expire_interval * HZ)))) { - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; - trace_writeback_lazytime(inode); - } - } else - inode->i_state &= ~I_DIRTY_TIME_EXPIRED; + if ((inode->i_state & I_DIRTY_TIME) && + ((dirty & I_DIRTY_INODE) || + wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || + time_after(jiffies, inode->dirtied_time_when + + dirtytime_expire_interval * HZ))) { + dirty |= I_DIRTY_TIME; + trace_writeback_lazytime(inode); + } inode->i_state &= ~dirty; /* @@ -1669,8 +1674,8 @@ static long writeback_sb_inodes(struct super_block *sb, */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); - redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { @@ -1811,7 +1816,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, &work); + queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); @@ -1831,7 +1836,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * - * older_than_this takes precedence over nr_to_write. So we'll only write back + * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, @@ -1839,14 +1844,11 @@ static long wb_writeback(struct bdi_writeback *wb, { unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; - unsigned long oldest_jif; + unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; - oldest_jif = jiffies; - work->older_than_this = &oldest_jif; - blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { @@ -1880,14 +1882,14 @@ static long wb_writeback(struct bdi_writeback *wb, * safe. */ if (work->for_kupdate) { - oldest_jif = jiffies - + dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) - oldest_jif = jiffies; + dirtied_before = jiffies; trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) - queue_io(wb, work); + queue_io(wb, work, dirtied_before); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else @@ -2289,11 +2291,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * If the inode is queued for writeback by flush worker, just + * update its dirty state. Once the flush worker is done with + * the inode it will place it on the appropriate superblock + * list, based upon its state. */ - if (inode->i_state & I_SYNC) + if (inode->i_state & I_SYNC_QUEUED) goto out_unlock_inode; /* diff --git a/fs/fs_context.c b/fs/fs_context.c index 7d5c5dd2b1d5..2834d1afa6e8 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -521,7 +521,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (param->type) { case fs_value_is_string: len = 1 + param->size; - /* Fall through */ + fallthrough; case fs_value_is_flag: len += strlen(param->key); break; diff --git a/fs/fsopen.c b/fs/fsopen.c index 2fa3f241b762..27a890aa493a 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -412,7 +412,7 @@ SYSCALL_DEFINE5(fsconfig, break; case FSCONFIG_SET_PATH_EMPTY: lookup_flags = LOOKUP_EMPTY; - /* fallthru */ + fallthrough; case FSCONFIG_SET_PATH: param.type = fs_value_is_filename; param.name = getname_flags(_value, lookup_flags, NULL); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 770f3a720db9..0f69fbd4af66 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -746,7 +746,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, } if (n == 0) break; - /* fall through - To branching from existing tree */ + fallthrough; /* To branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < mp->mp_fheight) gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); @@ -757,7 +757,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, state = ALLOC_DATA; if (n == 0) break; - /* fall through - To tree complete, adding data blocks */ + fallthrough; /* To tree complete, adding data blocks */ case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a58333e3980d..3763c9ff1406 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -902,6 +902,36 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } /** + * drain_bd - drain the buf and databuf queue for a failed transaction + * @tr: the transaction to drain + * + * When this is called, we're taking an error exit for a log write that failed + * but since we bypassed the after_commit functions, we need to remove the + * items from the buf and databuf queue. + */ +static void trans_drain(struct gfs2_trans *tr) +{ + struct gfs2_bufdata *bd; + struct list_head *head; + + if (!tr) + return; + + head = &tr->tr_buf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + head = &tr->tr_databuf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +/** * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem * @gl: The glock structure to flush. If NULL, flush the whole incore log @@ -1005,6 +1035,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) out: if (gfs2_withdrawn(sdp)) { + trans_drain(tr); /** * If the tr_list is empty, we're withdrawing during a log * flush that targets a transaction, but the transaction was diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 4b67d47a7e00..6e173ae378c4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1599,7 +1599,7 @@ static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) case GFS2_QUOTA_ON: state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; - /*FALLTHRU*/ + fallthrough; case GFS2_QUOTA_ACCOUNT: state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | QCI_SYSFILE; diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index e1c7eb6eb00a..6d4bf7ea7b3b 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -67,6 +67,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); + INIT_LIST_HEAD(&tr->tr_list); INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 61eec628805d..0350dc7821bf 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -195,7 +195,7 @@ reread: switch (sbi->s_vhdr->signature) { case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): set_bit(HFSPLUS_SB_HFSX, &sbi->flags); - /*FALLTHRU*/ + fallthrough; case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): break; case cpu_to_be16(HFSP_WRAP_MAGIC): diff --git a/fs/io-wq.c b/fs/io-wq.c index e92c4724480c..414beb543883 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -925,6 +925,24 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } +static inline void io_wqe_remove_pending(struct io_wqe *wqe, + struct io_wq_work *work, + struct io_wq_work_node *prev) +{ + unsigned int hash = io_get_work_hash(work); + struct io_wq_work *prev_work = NULL; + + if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (prev) + prev_work = container_of(prev, struct io_wq_work, list); + if (prev_work && io_get_work_hash(prev_work) == hash) + wqe->hash_tail[hash] = prev_work; + else + wqe->hash_tail[hash] = NULL; + } + wq_list_del(&wqe->work_list, &work->list, prev); +} + static void io_wqe_cancel_pending_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { @@ -938,8 +956,7 @@ retry: work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - - wq_list_del(&wqe->work_list, node, prev); + io_wqe_remove_pending(wqe, work, prev); spin_unlock_irqrestore(&wqe->lock, flags); io_run_cancel(work, wqe); match->nr_pending++; diff --git a/fs/io_uring.c b/fs/io_uring.c index dc506b75659c..3790c7fe9fee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -540,7 +540,6 @@ enum { REQ_F_ISREG_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, - REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, @@ -583,8 +582,6 @@ enum { REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* in overflow list */ - REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), /* already went through poll handler */ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ @@ -946,7 +943,8 @@ static void io_get_req_task(struct io_kiocb *req) static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | + REQ_F_INFLIGHT)) __io_clean_op(req); } @@ -1152,7 +1150,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_req_init_async(req); if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file) + if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) @@ -1366,7 +1364,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, compl.list); list_move(&req->compl.list, &list); - req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); @@ -1419,7 +1416,6 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } io_clean_op(req); - req->flags |= REQ_F_OVERFLOW; req->result = res; req->compl.cflags = cflags; refcount_inc(&req->refs); @@ -1563,17 +1559,6 @@ static bool io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - } - return io_req_clean_work(req); } @@ -1761,7 +1746,8 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, + bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; @@ -1774,7 +1760,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) * will do the job. */ notify = 0; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) + if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); @@ -1834,7 +1820,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -2063,6 +2049,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { + req->result = 0; req->iopoll_completed = 0; list_move_tail(&req->inflight_entry, &again); continue; @@ -2308,38 +2295,27 @@ end_req: io_req_complete(req, ret); return false; } - -static void io_rw_resubmit(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_ring_ctx *ctx = req->ctx; - int err; - - err = io_sq_thread_acquire_mm(ctx, req); - - if (io_resubmit_prep(req, err)) { - refcount_inc(&req->refs); - io_queue_async_work(req); - } - - percpu_ref_put(&ctx->refs); -} #endif static bool io_rw_reissue(struct io_kiocb *req, long res) { #ifdef CONFIG_BLOCK + umode_t mode = file_inode(req->file)->i_mode; int ret; + if (!S_ISBLK(mode) && !S_ISREG(mode)) + return false; if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - init_task_work(&req->task_work, io_rw_resubmit); - percpu_ref_get(&req->ctx->refs); + ret = io_sq_thread_acquire_mm(req->ctx, req); - ret = io_req_task_work_add(req, &req->task_work); - if (!ret) + if (io_resubmit_prep(req, ret)) { + refcount_inc(&req->refs); + io_queue_async_work(req); return true; + } + #endif return false; } @@ -2578,7 +2554,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) * IO with EINTR. */ ret = -EINTR; - /* fall through */ + fallthrough; default: kiocb->ki_complete(kiocb, ret, 0); } @@ -2819,22 +2795,15 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) +static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; ssize_t ret; u8 opcode; - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - *iovec = NULL; - return iov_iter_count(&iorw->iter); - } - opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; @@ -2848,10 +2817,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { buf = io_rw_buffer_select(req, &sqe_len, needs_lock); - if (IS_ERR(buf)) { - *iovec = NULL; + if (IS_ERR(buf)) return PTR_ERR(buf); - } req->rw.len = sqe_len; } @@ -2879,6 +2846,21 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) +{ + if (!req->io) + return __io_import_iovec(rw, req, iovec, iter, needs_lock); + *iovec = NULL; + return iov_iter_count(&req->io->rw.iter); +} + +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos; +} + /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. @@ -2914,10 +2896,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } if (iov_iter_is_bvec(iter)) @@ -2998,17 +2980,15 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw, bool force_nonblock) { struct io_async_rw *iorw = &req->io->rw; + struct iovec *iov; ssize_t ret; - iorw->iter.iov = iorw->fast_iov; - /* reset ->io around the iovec import, we don't want to use it */ - req->io = NULL; - ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, - &iorw->iter, !force_nonblock); - req->io = container_of(iorw, struct io_async_ctx, rw); + iorw->iter.iov = iov = iorw->fast_iov; + ret = __io_import_iovec(rw, req, &iov, &iorw->iter, !force_nonblock); if (unlikely(ret < 0)) return ret; + iorw->iter.iov = iov; io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); return 0; } @@ -3061,7 +3041,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -3074,27 +3054,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, return 1; } -static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, - struct wait_page_queue *wait, - wait_queue_func_t func, - void *data) -{ - /* Can't support async wakeup with polled IO */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) { - wait->wait.func = func; - wait->wait.private = data; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_waitq = wait; - return 0; - } - - return -EOPNOTSUPP; -} - /* * This controls whether a given IO request should be armed for async page * based retry. If we return false here, the request is handed to the async @@ -3109,16 +3068,17 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, */ static bool io_rw_should_retry(struct io_kiocb *req) { + struct wait_page_queue *wait = &req->io->rw.wpq; struct kiocb *kiocb = &req->rw.kiocb; - int ret; /* never retry for NOWAIT, we just complete with -EAGAIN */ if (req->flags & REQ_F_NOWAIT) return false; /* Only for buffered IO */ - if (kiocb->ki_flags & IOCB_DIRECT) + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; + /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -3126,14 +3086,15 @@ static bool io_rw_should_retry(struct io_kiocb *req) if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, - io_async_buf_func, req); - if (!ret) { - io_get_req_task(req); - return true; - } + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_waitq = wait; - return false; + io_get_req_task(req); + return true; } static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) @@ -3161,6 +3122,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); io_size = ret; req->result = io_size; ret = 0; @@ -3173,8 +3135,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; - iov_count = iov_iter_count(iter); - ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3186,14 +3147,21 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = 0; goto out_free; } else if (ret == -EAGAIN) { - if (!force_nonblock) + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; + /* no retry on NONBLOCK marked file */ + if (req->file->f_flags & O_NONBLOCK) + goto done; + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; return -EAGAIN; } else if (ret < 0) { - goto out_free; + /* make sure -ERESTARTSYS -> -EINTR is done */ + goto done; } /* read it all, or we did blocking attempt. no retry. */ @@ -3238,6 +3206,7 @@ done: kiocb_done(kiocb, ret, cs); ret = 0; out_free: + /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); return ret; @@ -3276,6 +3245,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); io_size = ret; req->result = io_size; @@ -3292,8 +3262,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(iter); - ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3325,15 +3294,25 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, */ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; + /* no retry on NONBLOCK marked file */ + if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK)) + goto done; if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + goto copy_iov; +done: kiocb_done(kiocb, ret2, cs); } else { copy_iov: + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; } out_free: + /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); return ret; @@ -4600,6 +4579,7 @@ struct io_poll_table { static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { + bool twa_signal_ok; int ret; /* for instances that support it check for an event match first: */ @@ -4615,12 +4595,20 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, percpu_ref_get(&req->ctx->refs); /* + * If we using the signalfd wait_queue_head for this wakeup, then + * it's not safe to use TWA_SIGNAL as we could be recursing on the + * tsk->sighand->siglock on doing the wakeup. Should not be needed + * either, as the normal wakeup will suffice. + */ + twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh); + + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok); if (unlikely(ret)) { struct task_struct *tsk; @@ -4909,12 +4897,20 @@ static bool io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask, ret; + int rw; if (!req->file || !file_can_poll(req->file)) return false; if (req->flags & REQ_F_POLLED) return false; - if (!def->pollin && !def->pollout) + if (def->pollin) + rw = READ; + else if (def->pollout) + rw = WRITE; + else + return false; + /* if we can't nonblock try, then no point in arming a poll handler */ + if (!io_file_supports_async(req->file, rw)) return false; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -5653,6 +5649,18 @@ static void __io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } + + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + } } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -7327,7 +7335,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - file = io_file_from_index(ctx, index); + file = table->files[index]; err = io_queue_file_removal(data, file); if (err) break; @@ -7356,6 +7364,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, table->files[index] = file; err = io_sqe_file_register(ctx, file, i); if (err) { + table->files[index] = NULL; fput(file); break; } @@ -7455,9 +7464,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - mmgrab(current->mm); - ctx->sqo_mm = current->mm; - if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) @@ -7502,10 +7508,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - if (ctx->sqo_mm) { - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; - } return ret; } @@ -7979,7 +7981,13 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) ACCT_LOCKED); INIT_WORK(&ctx->exit_work, io_ring_exit_work); - queue_work(system_wq, &ctx->exit_work); + /* + * Use system_unbound_wq to avoid spawning tons of event kworkers + * if we're exiting a ton of rings at the same time. It just adds + * noise and overhead, there's no discernable change in runtime + * over using system_wq. + */ + queue_work(system_unbound_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -8016,6 +8024,28 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } +static inline bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files; +} + +static bool io_match_link_files(struct io_kiocb *req, + struct files_struct *files) +{ + struct io_kiocb *link; + + if (io_match_files(req, files)) + return true; + if (req->flags & REQ_F_LINK_HEAD) { + list_for_each_entry(link, &req->link_list, link_list) { + if (io_match_files(link, files)) + return true; + } + } + return false; +} + /* * We're looking to cancel 'req' because it's holding on to our files, but * 'req' could be a link to another request. See if it is, and cancel that @@ -8063,12 +8093,65 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, return found; } +static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +{ + return io_match_link(container_of(work, struct io_kiocb, work), data); +} + +static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + enum io_wq_cancel cret; + + /* cancel this particular work, if it's running */ + cret = io_wq_cancel_work(ctx->io_wq, &req->work); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* find links that hold this pending, cancel those */ + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* if we have a poll link holding this pending, cancel that */ + if (io_poll_remove_link(ctx, req)) + return; + + /* final option, timeout link is holding this req pending */ + io_timeout_remove_link(ctx, req); +} + +static void io_cancel_defer_files(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct io_defer_entry *de = NULL; + LIST_HEAD(list); + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_reverse(de, &ctx->defer_list, list) { + if (io_match_link_files(de->req, files)) { + list_cut_position(&list, &ctx->defer_list, &de->list); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + + while (!list_empty(&list)) { + de = list_first_entry(&list, struct io_defer_entry, list); + list_del_init(&de->list); + req_set_fail_links(de->req); + io_put_req(de->req); + io_req_complete(de->req, -ECANCELED); + kfree(de); + } +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { if (list_empty_careful(&ctx->inflight_list)) return; + io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); @@ -8094,35 +8177,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* We need to keep going until we don't find a matching req */ if (!cancel_req) break; - - if (cancel_req->flags & REQ_F_OVERFLOW) { - spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->compl.list); - cancel_req->flags &= ~REQ_F_OVERFLOW; - - io_cqring_mark_overflow(ctx); - WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - /* - * Put inflight ref and overflow ref. If that's - * all we had, then we're done with this request. - */ - if (refcount_sub_and_test(2, &cancel_req->refs)) { - io_free_req(cancel_req); - finish_wait(&ctx->inflight_wait, &wait); - continue; - } - } else { - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); - /* could be a link, check and remove if it is */ - if (!io_poll_remove_link(ctx, cancel_req)) - io_timeout_remove_link(ctx, cancel_req); - io_put_req(cancel_req); - } - + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); + io_put_req(cancel_req); schedule(); finish_wait(&ctx->inflight_wait, &wait); } @@ -8548,6 +8605,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = user; ctx->creds = get_current_cred(); + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + /* * Account memory _before_ installing the file descriptor. Once * the descriptor is installed, it can get closed at any time. Also diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 89f61d93c0bc..107ee80c3568 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -127,7 +127,7 @@ iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, SEEK_HOLE); if (offset < 0) return length; - /* fall through */ + fallthrough; case IOMAP_HOLE: *(loff_t *)data = offset; return 0; @@ -175,7 +175,7 @@ iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, SEEK_DATA); if (offset < 0) return length; - /*FALLTHRU*/ + fallthrough; default: *(loff_t *)data = offset; return 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e4944436e733..17fdc482f554 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1285,7 +1285,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) * superblock as being NULL to prevent the journal destroy from writing * back a bogus superblock. */ -static void journal_fail_superblock (journal_t *journal) +static void journal_fail_superblock(journal_t *journal) { struct buffer_head *bh = journal->j_sb_buffer; brelse(bh); @@ -1367,8 +1367,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) int ret; /* Buffer got discarded which means block device got invalidated */ - if (!buffer_mapped(bh)) + if (!buffer_mapped(bh)) { + unlock_buffer(bh); return -EIO; + } trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) @@ -1815,7 +1817,7 @@ int jbd2_journal_destroy(journal_t *journal) /** - *int jbd2_journal_check_used_features () - Check if features specified are used. + *int jbd2_journal_check_used_features() - Check if features specified are used. * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -1825,7 +1827,7 @@ int jbd2_journal_destroy(journal_t *journal) * features. Return true (non-zero) if it does. **/ -int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, +int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { journal_superblock_t *sb; @@ -1860,7 +1862,7 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, * all of a given set of features on this journal. Return true * (non-zero) if it can. */ -int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, +int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { if (!compat && !ro && !incompat) @@ -1882,7 +1884,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com } /** - * int jbd2_journal_set_features () - Mark a given journal feature in the superblock + * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -1893,7 +1895,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com * */ -int jbd2_journal_set_features (journal_t *journal, unsigned long compat, +int jbd2_journal_set_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { #define INCOMPAT_FEATURE_ON(f) \ diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 2ed278f0dced..faa97d748474 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -690,14 +690,11 @@ static int do_one_pass(journal_t *journal, * number. */ if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal)) { - int chksum_err, chksum_seen; struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = be32_to_cpu(cbh->h_chksum[0]); - chksum_err = chksum_seen = 0; - if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; @@ -705,42 +702,23 @@ static int do_one_pass(journal_t *journal, break; } - if (crc32_sum == found_chksum && - cbh->h_chksum_type == JBD2_CRC32_CHKSUM && - cbh->h_chksum_size == - JBD2_CRC32_CHKSUM_SIZE) - chksum_seen = 1; - else if (!(cbh->h_chksum_type == 0 && - cbh->h_chksum_size == 0 && - found_chksum == 0 && - !chksum_seen)) - /* - * If fs is mounted using an old kernel and then - * kernel with journal_chksum is used then we - * get a situation where the journal flag has - * checksum flag set but checksums are not - * present i.e chksum = 0, in the individual - * commit blocks. - * Hence to avoid checksum failures, in this - * situation, this extra check is added. - */ - chksum_err = 1; - - if (chksum_err) { - info->end_transaction = next_commit_ID; - - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - brelse(bh); - break; - } - } + /* Neither checksum match nor unused? */ + if (!((crc32_sum == found_chksum && + cbh->h_chksum_type == + JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) || + (cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0))) + goto chksum_error; + crc32_sum = ~0; } if (pass == PASS_SCAN && !jbd2_commit_block_csum_verify(journal, bh->b_data)) { + chksum_error: info->end_transaction = next_commit_ID; if (!jbd2_has_feature_async_commit(journal)) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e91aad3637a2..43985738aa86 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2026,6 +2026,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { + J_ASSERT_JH(jh, jh->b_transaction != NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; } @@ -2078,10 +2081,6 @@ out: * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit - * code to release the buffers. - * * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN @@ -2112,11 +2111,11 @@ out: * * Return 0 on failure, 1 on success */ -int jbd2_journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t gfp_mask) +int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) { struct buffer_head *head; struct buffer_head *bh; + bool has_write_io_error = false; int ret = 0; J_ASSERT(PageLocked(page)); @@ -2141,11 +2140,26 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; + + /* + * If we free a metadata buffer which has been failed to + * write out, the jbd2 checkpoint procedure will not detect + * this failure and may lead to filesystem inconsistency + * after cleanup journal tail. + */ + if (buffer_write_io_error(bh)) { + pr_err("JBD2: Error while async write back metadata bh %llu.", + (unsigned long long)bh->b_blocknr); + has_write_io_error = true; + } } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(page); busy: + if (has_write_io_error) + jbd2_journal_abort(journal, -EIO); + return ret; } @@ -2572,6 +2586,13 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); + + /* + * b_transaction must be set, otherwise the new b_transaction won't + * be holding jh reference + */ + J_ASSERT_JH(jh, jh->b_transaction != NULL); + /* * We set b_transaction here because b_next_transaction will inherit * our jh reference and thus __jbd2_journal_file_buffer() must not diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index ab8cdd9e9325..78858f6e9583 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -341,7 +341,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); else rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); - /* fall through */ + fallthrough; case S_IFSOCK: case S_IFIFO: diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index bccfc40b3a74..2f6f0b140c05 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1273,7 +1273,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, dbg_readinode("symlink's target '%s' cached\n", f->target); } - /* fall through... */ + fallthrough; case S_IFBLK: case S_IFCHR: diff --git a/fs/libfs.c b/fs/libfs.c index 4d08edf19c78..e0d42e977d9a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -137,11 +137,11 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case 1: offset += file->f_pos; - /* fall through */ + fallthrough; case 0: if (offset >= 0) break; - /* fall through */ + fallthrough; default: return -EINVAL; } diff --git a/fs/locks.c b/fs/locks.c index 8fc0542f5132..1f84a03601fe 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1499,7 +1499,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg) switch (arg) { case F_UNLCK: fl->fl_flags &= ~FL_UNLOCK_PENDING; - /* fall through */ + fallthrough; case F_RDLCK: fl->fl_flags &= ~FL_DOWNGRADE_PENDING; } @@ -2525,7 +2525,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, cmd = F_SETLKW; file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = filp; - /* Fallthrough */ + fallthrough; case F_SETLKW: file_lock->fl_flags |= FL_SLEEP; } @@ -2656,7 +2656,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, cmd = F_SETLKW64; file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = filp; - /* Fallthrough */ + fallthrough; case F_SETLKW64: file_lock->fl_flags |= FL_SLEEP; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d1a0e2c8b1b4..08108b6d2fa1 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -753,7 +753,7 @@ out: case -ENODEV: /* Our extent block devices are unavailable */ set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); - /* Fall through */ + fallthrough; case 0: return lseg; default: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a12f42e7d8c7..e732580fe47b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1181,7 +1181,7 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) /* A NFSv4 OPEN will revalidate later */ if (server->caps & NFS_CAP_ATOMIC_OPEN) goto out; - /* Fallthrough */ + fallthrough; case S_IFDIR: if (server->flags & NFS_MOUNT_NOCTO) break; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a13e69009f19..7f5aa0403e16 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -187,7 +187,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, pnfs_error_mark_layout_for_return(inode, lseg); pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); - /* fall through */ + fallthrough; default: reset: dprintk("%s Retry through MDS. Error %d\n", __func__, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 965145592750..ff8965d1a4d4 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1133,7 +1133,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - /* fall through */ + fallthrough; default: if (ff_layout_avoid_mds_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; @@ -1260,7 +1260,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, */ if (opnum == OP_READ) break; - /* Fallthrough */ + fallthrough; default: pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 66949da0e827..524812984e2d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -651,21 +651,21 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: protofamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_udp: ctx->flags &= ~NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_tcp: ctx->flags |= NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_rdma: /* vector side protocols to TCP */ ctx->flags |= NFS_MOUNT_TCP; @@ -684,13 +684,13 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: mountfamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_udp: ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; - /* fall through */ + fallthrough; case Opt_xprt_tcp: ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -899,9 +899,11 @@ static int nfs23_parse_monolithic(struct fs_context *fc, ctx->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: - data->namlen = 0; /* fall through */ + data->namlen = 0; + fallthrough; case 2: - data->bsize = 0; /* fall through */ + data->bsize = 0; + fallthrough; case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; @@ -909,14 +911,14 @@ static int nfs23_parse_monolithic(struct fs_context *fc, memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ extra_flags |= NFS_MOUNT_SECFLAVOUR; - /* fall through */ + fallthrough; case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; - /* fall through */ + fallthrough; case 5: memset(data->context, 0, sizeof(data->context)); - /* fall through */ + fallthrough; case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 26c94b32d6f4..c6c863382f37 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -108,7 +108,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) case -EPROTONOSUPPORT: dprintk("NFS_V3_ACL extension not supported; disabling\n"); server->caps &= ~NFS_CAP_ACLS; - /* fall through */ + fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; default: @@ -228,7 +228,7 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS_V3_ACL SETACL RPC not supported" "(will not retry)\n"); server->caps &= ~NFS_CAP_ACLS; - /* fall through */ + fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a33970765467..fdfc77486ace 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -211,7 +211,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) ret = nfs42_proc_llseek(filep, offset, whence); if (ret != -ENOTSUPP) return ret; - /* Fall through */ + fallthrough; default: return nfs_file_llseek(filep, offset, whence); } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 1e7296395d71..62e6eea5c516 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -520,7 +520,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, switch (token) { case Opt_find_uid: im->im_type = IDMAP_TYPE_USER; - /* Fall through */ + fallthrough; case Opt_find_gid: im->im_conv = IDMAP_CONV_NAMETOID; ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); @@ -528,7 +528,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, case Opt_find_user: im->im_type = IDMAP_TYPE_USER; - /* Fall through */ + fallthrough; case Opt_find_group: im->im_conv = IDMAP_CONV_IDTONAME; ret = match_int(&substr, &im->im_id); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dbd01548335b..6e95c85fe395 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -483,7 +483,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, stateid); goto wait_on_recovery; } - /* Fall through */ + fallthrough; case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -534,10 +534,10 @@ static int nfs4_do_handle_exception(struct nfs_server *server, ret = -EBUSY; break; } - /* Fall through */ + fallthrough; case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); - /* Fall through */ + fallthrough; case -NFS4ERR_GRACE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: @@ -1505,7 +1505,7 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, case NFS4_OPEN_CLAIM_PREVIOUS: if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) break; - /* Fall through */ + fallthrough; default: return 0; } @@ -2439,7 +2439,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; - /* Fall through */ + fallthrough; case NFS4_OPEN_CLAIM_FH: task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; } @@ -3293,8 +3293,10 @@ static int _nfs4_do_setattr(struct inode *inode, /* Servers should only apply open mode checks for file size changes */ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; - if (!truncate) + if (!truncate) { + nfs4_inode_make_writeable(inode); goto zero_stateid; + } if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) { /* Use that stateid */ @@ -3545,11 +3547,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) nfs4_free_revoked_stateid(server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - /* Fallthrough */ + fallthrough; case -NFS4ERR_BAD_STATEID: if (calldata->arg.fmode == 0) break; - /* Fallthrough */ + fallthrough; default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); @@ -6294,7 +6296,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) nfs4_free_revoked_stateid(data->res.server, data->args.stateid, task->tk_msg.rpc_cred); - /* Fallthrough */ + fallthrough; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: case -ETIMEDOUT: @@ -6314,7 +6316,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) data->res.fattr = NULL; goto out_restart; } - /* Fallthrough */ + fallthrough; default: task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, @@ -6622,13 +6624,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; - /* Fall through */ + fallthrough; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(calldata->server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - /* Fall through */ + fallthrough; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: if (nfs4_sync_lock_stateid(&calldata->arg.stateid, @@ -7298,7 +7300,12 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, err = nfs4_set_lock_state(state, fl); if (err != 0) return err; - err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + do { + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + if (err != -NFS4ERR_DELAY) + break; + ssleep(1); + } while (err == -NFS4ERR_DELAY); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -8665,7 +8672,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; - /* fall through */ + fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: rpc_restart_call_prepare(task); return; @@ -9113,13 +9120,13 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf switch(task->tk_status) { case 0: wake_up_all(&clp->cl_lock_waitq); - /* Fallthrough */ + fallthrough; case -NFS4ERR_COMPLETE_ALREADY: case -NFS4ERR_WRONG_CRED: /* What to do here? */ break; case -NFS4ERR_DELAY: rpc_delay(task, NFS4_POLL_RETRY_MAX); - /* fall through */ + fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; case -NFS4ERR_BADSESSION: @@ -9434,10 +9441,10 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) &lrp->args.range, lrp->args.inode)) goto out_restart; - /* Fallthrough */ + fallthrough; default: task->tk_status = 0; - /* Fallthrough */ + fallthrough; case 0: break; case -NFS4ERR_DELAY: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b1dba24918f8..4bf10792cb5b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1530,7 +1530,7 @@ restart: default: pr_err("NFS: %s: unhandled error %d\n", __func__, status); - /* Fall through */ + fallthrough; case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: @@ -1667,7 +1667,7 @@ restart: break; } printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); - /* Fall through */ + fallthrough; case -ENOENT: case -ENOMEM: case -EACCES: @@ -1683,7 +1683,7 @@ restart: set_bit(ops->state_flag_bit, &state->flags); break; } - /* Fall through */ + fallthrough; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -1695,7 +1695,7 @@ restart: case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); - /* Fall through */ + fallthrough; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -2273,11 +2273,11 @@ again: case -ETIMEDOUT: if (clnt->cl_softrtry) break; - /* Fall through */ + fallthrough; case -NFS4ERR_DELAY: case -EAGAIN: ssleep(1); - /* Fall through */ + fallthrough; case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); @@ -2289,7 +2289,7 @@ again: } if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) break; - /* Fall through */ + fallthrough; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: /* No point in retrying if we already used RPC_AUTH_UNIX */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6ea4cac41e46..6985cacf4700 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -711,7 +711,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, case FLUSH_COND_STABLE: if (nfs_reqs_to_commit(cinfo)) break; - /* fall through */ + fallthrough; default: hdr->args.stable = NFS_FILE_SYNC; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 40332c758d84..71f7741126b6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1541,7 +1541,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, case 0: if (res->lrs_present) res_stateid = &res->stateid; - /* Fallthrough */ + fallthrough; default: arg_stateid = &args->stateid; } diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 8ceb6425e01a..d056ad2fdefd 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -237,7 +237,7 @@ posix_acl_from_nfsacl(struct posix_acl *acl) break; case ACL_MASK: mask = pa; - /* fall through */ + fallthrough; case ACL_OTHER: break; } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 9bbaa671c079..311e5ce80cfc 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -83,13 +83,13 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, bex->soff = iomap.addr; break; } - /*FALLTHRU*/ + fallthrough; case IOMAP_HOLE: if (seg->iomode == IOMODE_READ) { bex->es = PNFS_BLOCK_NONE_DATA; break; } - /*FALLTHRU*/ + fallthrough; case IOMAP_DELALLOC: default: WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7fbe9840a03e..052be5bf9ef5 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1119,7 +1119,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - /* Fall through */ + fallthrough; case 1: case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index e12409eca7cc..a97873f2d22b 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -681,7 +681,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) rpc_delay(task, HZ/100); /* 10 mili-seconds */ return 0; } - /* Fallthrough */ + fallthrough; default: /* * Unknown error or non-responding client, we'll need to fence. diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a527da3d8052..eaf50eafa935 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -428,7 +428,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; reclaim = true; - /* fall through */ + fallthrough; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 81ed8e8bab3f..c09a2a4281ec 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3117,7 +3117,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; default: /* checked by xdr code */ WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case SP4_SSV: status = nfserr_encr_alg_unsupp; goto out_nolock; @@ -4532,7 +4532,7 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, rpc_delay(task, 2 * HZ); return 0; } - /*FALLTHRU*/ + fallthrough; default: return 1; } @@ -4597,6 +4597,8 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) if (!i_am_nfsd()) return NULL; rqst = kthread_data(current); + if (!rqst->rq_lease_breaker) + return NULL; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; } @@ -5652,7 +5654,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) break; default: printk("unknown stateid type %x\n", s->sc_type); - /* Fallthrough */ + fallthrough; case NFS4_CLOSED_STID: case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; @@ -6742,7 +6744,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case NFS4_READW_LT: if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; - /* Fallthrough */ + fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); nf = find_readable_file_locked(fp); @@ -6754,7 +6756,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case NFS4_WRITEW_LT: if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; - /* Fallthrough */ + fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); nf = find_writeable_file_locked(fp); @@ -6816,7 +6818,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case FILE_LOCK_DEFERRED: nbl = NULL; - /* Fallthrough */ + fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 37bc8f5f4514..c81dbbad8792 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -459,7 +459,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_DEV: if (!old_valid_dev(exp_sb(exp)->s_dev)) return false; - /* FALL THROUGH */ + fallthrough; case FSID_MAJOR_MINOR: case FSID_ENCODE_DEV: return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; @@ -469,7 +469,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_UUID16: if (!is_root_export(exp)) return false; - /* fall through */ + fallthrough; case FSID_UUID4_INUM: case FSID_UUID16_INUM: return exp->ex_uuid != NULL; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 543bbe0a556e..6e0b066480c5 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -314,7 +314,7 @@ nfsd_proc_create(struct svc_rqst *rqstp) rdev = inode->i_rdev; attr->ia_valid |= ATTR_SIZE; - /* FALLTHROUGH */ + fallthrough; case S_IFIFO: /* this is probably a permission check.. * at least IRIX implements perm checking on diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b603dfcdd361..f7f6473578af 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -221,7 +221,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) case NFSD_TEST: if (nn->nfsd_versions) return nn->nfsd_versions[vers]; - /* Fallthrough */ + fallthrough; case NFSD_AVAIL: return nfsd_support_version(vers); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7d2933b85b65..aba5af9df328 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1456,7 +1456,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = true; break; } - /* fall through */ + fallthrough; case NFS4_CREATE_EXCLUSIVE4_1: if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime && d_inode(dchild)->i_atime.tv_sec == v_atime @@ -1465,7 +1465,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = true; goto set_attr; } - /* fall through */ + fallthrough; case NFS3_CREATE_GUARDED: err = nfserr_exist; } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index fb5a9a8a13cf..e516ae389ca5 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -519,7 +519,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) break; case NILFS_IFILE_INO: lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); - /* Fall through */ + fallthrough; default: bmap->b_ptr_type = NILFS_BMAP_PTR_VM; bmap->b_last_allocated_key = 0; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0b453ef8fae5..2217f904a7cf 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -626,7 +626,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; - /* Fall through */ + fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a651e821c2de..e3726aca28ed 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1138,7 +1138,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; @@ -1159,7 +1160,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.gc_inode_ptr = NULL; } - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, @@ -1186,7 +1188,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; - /* Fall through */ + fallthrough; case NILFS_ST_IFILE: err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, &nilfs_sc_file_ops); @@ -1197,13 +1199,14 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) err = nilfs_segctor_create_checkpoint(sci); if (unlikely(err)) break; - /* Fall through */ + fallthrough; case NILFS_ST_CPFILE: err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, &nilfs_sc_file_ops); if (unlikely(err)) break; - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_SUFILE: err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, &ndone); @@ -1219,7 +1222,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, @@ -1230,7 +1234,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - nilfs_sc_cstage_inc(sci); /* Fall through */ + nilfs_sc_cstage_inc(sci); + fallthrough; case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 559de311deca..3e01d8f2ab90 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1147,7 +1147,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { - case FAN_MARK_ADD: /* fallthrough */ + case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) return -EINVAL; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ef24574f481..cea739be77c4 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -67,7 +67,7 @@ static void o2quo_fence_self(void) default: WARN_ON(o2nm_single_cluster->cl_fence_method >= O2NM_FENCE_METHODS); - /* fall through */ + fallthrough; case O2NM_FENCE_RESET: printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " "system by restarting ***\n"); diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 819428dfa32f..3ce89216670c 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -1081,7 +1081,6 @@ next_zone: readop = psz_ftrace_read; break; case PSTORE_TYPE_CONSOLE: - fallthrough; case PSTORE_TYPE_PMSG: readop = psz_record_read; break; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 5444d3c4d93f..47f9e151988b 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -38,7 +38,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) || (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id)))) break; - /*FALLTHROUGH*/ + fallthrough; default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 6b2b4362089e..b57b3ffcbc32 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -217,10 +217,8 @@ int romfs_dev_read(struct super_block *sb, unsigned long pos, size_t limit; limit = romfs_maxsize(sb); - if (pos >= limit) + if (pos >= limit || buflen > limit - pos) return -EIO; - if (buflen > limit - pos) - buflen = limit - pos; #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) diff --git a/fs/seq_file.c b/fs/seq_file.c index 4e6239f33c06..31219c1db17d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -295,7 +295,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_CUR: offset += file->f_pos; - /* fall through */ + fallthrough; case SEEK_SET: if (offset < 0) break; diff --git a/fs/signalfd.c b/fs/signalfd.c index 5b78719be445..456046e15873 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -176,7 +176,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info if (!nonblock) break; ret = -EAGAIN; - /* fall through */ + fallthrough; default: spin_unlock_irq(¤t->sighand->siglock); return ret; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 76bb1c846845..8a19773b5a0b 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,11 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - bio = bio_alloc(GFP_NOIO, page_count); + if (page_count <= BIO_MAX_PAGES) + bio = bio_alloc(GFP_NOIO, page_count); + else + bio = bio_kmalloc(GFP_NOIO, page_count); + if (!bio) return -ENOMEM; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 22bfda158f7f..6d6cd85c2b4c 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -269,7 +269,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, break; /* No more room on heap so make it un-categorized */ cat = LPROPS_UNCAT; - /* Fall through */ + fallthrough; case LPROPS_UNCAT: list_add(&lprops->list, &c->uncat_list); break; @@ -313,7 +313,7 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, case LPROPS_FREEABLE: c->freeable_cnt -= 1; ubifs_assert(c, c->freeable_cnt >= 0); - /* Fall through */ + fallthrough; case LPROPS_UNCAT: case LPROPS_EMPTY: case LPROPS_FRDI_IDX: diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 6023c97c6da2..25ff91c7e94a 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -52,7 +52,7 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from, elen += pc->lengthComponentIdent; break; } - /* Fall through */ + fallthrough; case 2: if (tolen == 0) return -ENAMETOOLONG; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index e1f1b2e868a7..4931bec1a01c 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -42,7 +42,7 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); - /* Fall Through - to UFS_ST_SUN */ + fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); case UFS_ST_SUNx86: @@ -63,7 +63,7 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); break; } - /* Fall Through - to UFS_ST_SUN */ + fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); break; @@ -197,7 +197,7 @@ ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_suid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid); - /* Fall through */ + fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_suid); } @@ -215,7 +215,7 @@ ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; - /* Fall through */ + fallthrough; default: inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); break; @@ -231,7 +231,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); - /* Fall through */ + fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); } @@ -249,7 +249,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; - /* Fall through */ + fallthrough; default: inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); break; diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 96bd160da48b..018057546067 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -226,7 +226,7 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat, break; case AT_STATX_FORCE_SYNC: sf_i->force_restat = 1; - /* fall-through */ + fallthrough; default: err = vboxsf_inode_revalidate(dentry); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8623c815164a..305d4bc07337 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -653,8 +653,8 @@ xfs_attr_shortform_create( ASSERT(ifp->if_flags & XFS_IFINLINE); } xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; - hdr->count = 0; + hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -1036,8 +1036,10 @@ xfs_attr_shortform_verify( * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are * within the data buffer. + * xfs_attr_sf_entry is defined with a 1-byte variable + * array at the end, so we must subtract that off. */ - if (((char *)sfep + sizeof(*sfep)) >= endp) + if (((char *)sfep + sizeof(*sfep) - 1) >= endp) return __this_address; /* Don't allow names with known bad length. */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9c40d5971035..1b0a01b06a05 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6226,7 +6226,7 @@ xfs_bmap_validate_extent( isrt = XFS_IS_REALTIME_INODE(ip); endfsb = irec->br_startblock + irec->br_blockcount - 1; - if (isrt) { + if (isrt && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbno(mp, irec->br_startblock)) return __this_address; if (!xfs_verify_rtbno(mp, endfsb)) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f742a96a2fe1..a6b37db55169 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -688,7 +688,7 @@ xfs_ialloc_ag_alloc( args.minalignslop = igeo->cluster_align - 1; /* Allow space for the inode btree to split. */ - args.minleft = igeo->inobt_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels; if ((error = xfs_alloc_vextent(&args))) return error; @@ -736,7 +736,7 @@ xfs_ialloc_ag_alloc( /* * Allow space for the inode btree to split. */ - args.minleft = igeo->inobt_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels; if ((error = xfs_alloc_vextent(&args))) return error; } diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index e15129647e00..b7e222befb08 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -110,9 +110,9 @@ xfs_trans_log_inode( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); } diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index c6df01a2a158..7ad3659c5d2a 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -58,7 +58,7 @@ #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ - (M_IGEO(mp)->inobt_maxlevels - 1))) + M_IGEO(mp)->inobt_maxlevels)) /* * Space reservation values for various transactions. diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73cafc843cd7..5123f82f2477 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1165,7 +1165,7 @@ xfs_insert_file_space( goto out_trans_cancel; do { - error = xfs_trans_roll_inode(&tp, ip); + error = xfs_defer_finish(&tp); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c31cd3be9fb2..a29f78a663ca 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1223,6 +1223,14 @@ __xfs_filemap_fault( return ret; } +static inline bool +xfs_is_write_fault( + struct vm_fault *vmf) +{ + return (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); +} + static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) @@ -1230,7 +1238,7 @@ xfs_filemap_fault( /* DAX can shortcut the normal fault path on write faults! */ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, IS_DAX(file_inode(vmf->vma->vm_file)) && - (vmf->flags & FAULT_FLAG_WRITE)); + xfs_is_write_fault(vmf)); } static vm_fault_t @@ -1243,7 +1251,7 @@ xfs_filemap_huge_fault( /* DAX can shortcut the normal fault path on write faults! */ return __xfs_filemap_fault(vmf, pe_size, - (vmf->flags & FAULT_FLAG_WRITE)); + xfs_is_write_fault(vmf)); } static vm_fault_t |