Diffstat (limited to 'fs')
40 files changed, 372 insertions, 246 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 6765949b3aab..380ad5ace7cf 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -169,7 +169,7 @@ static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
 
 	spin_lock(&server->probe_lock);
 
-	if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+	if (!test_and_set_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
 		server->cm_epoch = call->epoch;
 		server->probe.cm_epoch = call->epoch;
 		goto out;
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index e1b9ed679045..a587767b6ae1 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -117,11 +117,8 @@ out:
 	       (unsigned int)rtt, ret);
 
 	have_result |= afs_fs_probe_done(server);
-	if (have_result) {
-		server->probe.have_result = true;
-		wake_up_var(&server->probe.have_result);
+	if (have_result)
 		wake_up_all(&server->probe_wq);
-	}
 }
 
 /*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ef732dd4e7ef..80255513e230 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -533,12 +533,10 @@ struct afs_server {
 		u32	abort_code;
 		u32	cm_epoch;
 		short	error;
-		bool	have_result;
 		bool	responded:1;
 		bool	is_yfs:1;
 		bool	not_yfs:1;
 		bool	local_failure:1;
-		bool	no_epoch:1;
 		bool	cm_probed:1;
 		bool	said_rebooted:1;
 		bool	said_inconsistent:1;
@@ -1335,7 +1333,7 @@ extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
 extern void afs_activate_volume(struct afs_volume *);
 extern void afs_deactivate_volume(struct afs_volume *);
 extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
-extern int afs_check_volume_status(struct afs_volume *, struct key *);
+extern int afs_check_volume_status(struct afs_volume *, struct afs_fs_cursor *);
 
 /*
  * write.c
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 172ba569cd60..2a3305e42b14 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -192,7 +192,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 			write_unlock(&vnode->volume->servers_lock);
 
 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
-			error = afs_check_volume_status(vnode->volume, fc->key);
+			error = afs_check_volume_status(vnode->volume, fc);
 			if (error < 0)
 				goto failed_set_error;
 
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 
 			set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
-			error = afs_check_volume_status(vnode->volume, fc->key);
+			error = afs_check_volume_status(vnode->volume, fc);
 			if (error < 0)
 				goto failed_set_error;
 
@@ -341,7 +341,7 @@ start:
 	/* See if we need to do an update of the volume record.  Note that the
 	 * volume may have moved or even have been deleted.
 	 */
-	error = afs_check_volume_status(vnode->volume, fc->key);
+	error = afs_check_volume_status(vnode->volume, fc);
 	if (error < 0)
 		goto failed_set_error;
 
diff --git a/fs/afs/server.c b/fs/afs/server.c
index b7f3cb2130ca..11b90ac7ea30 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -594,12 +594,9 @@ retry:
 	}
 
 	ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
-			  TASK_INTERRUPTIBLE);
+			  (fc->flags & AFS_FS_CURSOR_INTR) ?
+			  TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 	if (ret == -ERESTARTSYS) {
-		if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) {
-			_leave(" = t [intr]");
-			return true;
-		}
 		fc->error = ret;
 		_leave(" = f [intr]");
 		return false;
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index 9a5ce9687779..72eacc14e6e1 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -302,8 +302,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
 		pr_notice("VC:  - nr=%u/%u/%u pf=%u\n",
 			  a->nr_ipv4, a->nr_addrs, a->max_addrs,
 			  a->preferred);
-		pr_notice("VC:  - pr=%lx R=%lx F=%lx\n",
-			  a->probed, a->responded, a->failed);
+		pr_notice("VC:  - R=%lx F=%lx\n",
+			  a->responded, a->failed);
 		if (a == vc->ac.alist)
 			pr_notice("VC:  - current\n");
 	}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 92ca5e27573b..4310336b9bb8 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -281,7 +281,7 @@ error:
 /*
  * Make sure the volume record is up to date.
  */
-int afs_check_volume_status(struct afs_volume *volume, struct key *key)
+int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc)
 {
 	time64_t now = ktime_get_real_seconds();
 	int ret, retries = 0;
@@ -299,7 +299,7 @@ retry:
 	}
 
 	if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) {
-		ret = afs_update_volume_status(volume, key);
+		ret = afs_update_volume_status(volume, fc->key);
 		clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags);
 		clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags);
 		wake_up_bit(&volume->flags, AFS_VOLUME_WAIT);
@@ -312,7 +312,9 @@ retry:
 		return 0;
 	}
 
-	ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE);
+	ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT,
+			  (fc->flags & AFS_FS_CURSOR_INTR) ?
+			  TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 	if (ret == -ERESTARTSYS) {
 		_leave(" = %d", ret);
 		return ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 52b6f646cdbd..93672c3f1c78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -19,7 +19,6 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
-#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
@@ -1893,6 +1892,16 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
+	/*
+	 * Sync early if it looks like we're the last one. If someone else
+	 * opens the block device between now and the decrement of bd_openers
+	 * then we did a sync that we didn't need to, but that's not the end
+	 * of the world and we want to avoid long (could be several minute)
+	 * syncs while holding the mutex.
+	 */
+	if (bdev->bd_openers == 1)
+		sync_blockdev(bdev);
+
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (for_part)
 		bdev->bd_part_count--;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7e362a6935fd..d35936c934ab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1527,8 +1527,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	int clear_rsv = 0;
 	int ret;
 
-	if (!rc || !rc->create_reloc_tree ||
-	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+	if (!rc)
 		return 0;
 
 	/*
@@ -1538,12 +1537,28 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	if (reloc_root_is_dead(root))
 		return 0;
 
+	/*
+	 * This is subtle but important.  We do not do
+	 * record_root_in_transaction for reloc roots, instead we record their
+	 * corresponding fs root, and then here we update the last trans for the
+	 * reloc root.  This means that we have to do this for the entire life
+	 * of the reloc root, regardless of which stage of the relocation we are
+	 * in.
+	 */
 	if (root->reloc_root) {
 		reloc_root = root->reloc_root;
 		reloc_root->last_trans = trans->transid;
 		return 0;
 	}
 
+	/*
+	 * We are merging reloc roots, we do not need new reloc trees.  Also
+	 * reloc trees never need their own reloc tree.
+	 */
+	if (!rc->create_reloc_tree ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		return 0;
+
 	if (!trans->reloc_reserved) {
 		rsv = trans->block_rsv;
 		trans->block_rsv = rc->block_rsv;
diff --git a/fs/buffer.c b/fs/buffer.c
index f73276d746bb..a60f60396cfa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -967,7 +967,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct page *page;
 	struct buffer_head *bh;
 	sector_t end_block;
-	int ret = 0;		/* Will call free_more_memory() */
+	int ret = 0;
 	gfp_t gfp_mask;
 
 	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
@@ -1371,6 +1371,17 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
 }
 EXPORT_SYMBOL(__breadahead);
 
+void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
+		      gfp_t gfp)
+{
+	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+	if (likely(bh)) {
+		ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+		brelse(bh);
+	}
+}
+EXPORT_SYMBOL(__breadahead_gfp);
+
 /**
  *  __bread_gfp() - reads a specified block and returns the bh
  *  @bdev: the block_device to read from
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 140efc1a9374..182b864b3075 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -594,6 +594,8 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
 			cifs_max_pending);
 	set_credits(server, server->maxReq);
 	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+	/* set up max_read for readpages check */
+	server->max_read = server->maxBuf;
 	/* even though we do not use raw we might as well set this
 	accurately, in case we ever find a need for it */
 	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -755,6 +757,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	set_credits(server, server->maxReq);
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
+	/* set up max_read for readpages check */
+	server->max_read = server->maxBuf;
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
 	cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8fbbdcdad8ff..390d2b15ef6e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -61,7 +61,7 @@ static void cifs_set_ops(struct inode *inode)
 		}
 
 		/* check if server can support readpages */
-		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
+		if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read <
 				PAGE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 47d3e382ecaa..b30aa3cdd845 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1552,6 +1552,21 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
 	}
 
 	rc = SMB2_sess_establish_session(sess_data);
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+	if (ses->server->dialect < SMB30_PROT_ID) {
+		cifs_dbg(VFS, "%s: dumping generated SMB2 session keys\n", __func__);
+		/*
+		 * The session id is opaque in terms of endianness, so we can't
+		 * print it as a long long. we dump it as we got it on the wire
+		 */
+		cifs_dbg(VFS, "Session Id    %*ph\n", (int)sizeof(ses->Suid),
+			 &ses->Suid);
+		cifs_dbg(VFS, "Session Key   %*ph\n",
+			 SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
+		cifs_dbg(VFS, "Signing Key   %*ph\n",
+			 SMB3_SIGN_KEY_SIZE, ses->auth_key.response);
+	}
+#endif
 out:
 	kfree(ntlmssp_blob);
 	SMB2_sess_free_buffer(sess_data);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 1a6c227ada8f..c0348e3b1695 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -660,8 +660,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 		return rc;
 
 	if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) {
-		dump_stack();
-		cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n", shdr->Command, shdr->MessageId);
+		cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n",
+			 shdr->Command, shdr->MessageId);
 		return -EACCES;
 	} else
 		return 0;
diff --git a/fs/coredump.c b/fs/coredump.c
index f8296a82d01d..408418e6aa13 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -211,6 +211,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
 			return -ENOMEM;
 		(*argv)[(*argc)++] = 0;
 		++pat_ptr;
+		if (!(*pat_ptr))
+			return -ENOMEM;
 	}
 
 	/* Repeat as long as we have more pattern to process and more output
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 6a04cc02565a..6774a5a6ded8 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -91,7 +91,6 @@ static int exfat_allocate_bitmap(struct super_block *sb,
 		}
 	}
 
-	sbi->pbr_bh = NULL;
 	return 0;
 }
 
@@ -137,8 +136,6 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi)
 {
 	int i;
 
-	brelse(sbi->pbr_bh);
-
 	for (i = 0; i < sbi->map_sectors; i++)
 		__brelse(sbi->vol_amap[i]);
 
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 67d4e46fb810..d67fb8a6f770 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -507,6 +507,7 @@ void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...)
 		__printf(3, 4) __cold;
 void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
 		u8 tz, __le16 time, __le16 date, u8 time_ms);
+void exfat_truncate_atime(struct timespec64 *ts);
 void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
 		u8 *tz, __le16 *time, __le16 *date, u8 *time_ms);
 unsigned short exfat_calc_chksum_2byte(void *data, int len,
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 483f683757aa..4f76764165cf 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -273,6 +273,7 @@ int exfat_getattr(const struct path *path, struct kstat *stat,
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 
 	generic_fillattr(inode, stat);
+	exfat_truncate_atime(&stat->atime);
 	stat->result_mask |= STATX_BTIME;
 	stat->btime.tv_sec = ei->i_crtime.tv_sec;
 	stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
@@ -339,6 +340,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	setattr_copy(inode, attr);
+	exfat_truncate_atime(&inode->i_atime);
 	mark_inode_dirty(inode);
 
 out:
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 14a3300848f6..ebd2cbe3cbc1 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -88,7 +88,8 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
 	if (time_ms) {
 		ts->tv_sec += time_ms / 100;
 		ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC;
-	}
+	} else
+		ts->tv_nsec = 0;
 
 	if (tz & EXFAT_TZ_VALID)
 		/* Adjust timezone to UTC0. */
@@ -124,6 +125,17 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
 	*tz = EXFAT_TZ_VALID;
 }
 
+/*
+ * The timestamp for access_time has double seconds granularity.
+ * (There is no 10msIncrement field for access_time unlike create/modify_time)
+ * atime also has only a 2-second resolution.
+ */
+void exfat_truncate_atime(struct timespec64 *ts)
+{
+	ts->tv_sec = round_down(ts->tv_sec, 2);
+	ts->tv_nsec = 0;
+}
+
 unsigned short exfat_calc_chksum_2byte(void *data, int len,
 		unsigned short chksum, int type)
 {
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index a8681d91f569..b72d782568b8 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -595,6 +595,7 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	inode_inc_iversion(inode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 		EXFAT_I(inode)->i_crtime = current_time(inode);
+	exfat_truncate_atime(&inode->i_atime);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -854,6 +855,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
 
 	inode_inc_iversion(dir);
 	dir->i_mtime = dir->i_atime = current_time(dir);
+	exfat_truncate_atime(&dir->i_atime);
 	if (IS_DIRSYNC(dir))
 		exfat_sync_inode(dir);
 	else
@@ -861,6 +863,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
 
 	clear_nlink(inode);
 	inode->i_mtime = inode->i_atime = current_time(inode);
+	exfat_truncate_atime(&inode->i_atime);
 	exfat_unhash_inode(inode);
 	exfat_d_version_set(dentry, inode_query_iversion(dir));
 unlock:
@@ -903,6 +906,7 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode_inc_iversion(inode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 		EXFAT_I(inode)->i_crtime = current_time(inode);
+	exfat_truncate_atime(&inode->i_atime);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -1019,6 +1023,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
 
 	inode_inc_iversion(dir);
 	dir->i_mtime = dir->i_atime = current_time(dir);
+	exfat_truncate_atime(&dir->i_atime);
 	if (IS_DIRSYNC(dir))
 		exfat_sync_inode(dir);
 	else
@@ -1027,6 +1032,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
 
 	clear_nlink(inode);
 	inode->i_mtime = inode->i_atime = current_time(inode);
+	exfat_truncate_atime(&inode->i_atime);
 	exfat_unhash_inode(inode);
 	exfat_d_version_set(dentry, inode_query_iversion(dir));
 unlock:
@@ -1387,6 +1393,7 @@ static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	inode_inc_iversion(new_dir);
 	new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime =
 		EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
+	exfat_truncate_atime(&new_dir->i_atime);
 	if (IS_DIRSYNC(new_dir))
 		exfat_sync_inode(new_dir);
 	else
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 16ed202ef527..0565d5539d57 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -49,6 +49,7 @@ static void exfat_put_super(struct super_block *sb)
 	sync_blockdev(sb->s_bdev);
 	exfat_set_vol_flags(sb, VOL_CLEAN);
 	exfat_free_bitmap(sbi);
+	brelse(sbi->pbr_bh);
 	mutex_unlock(&sbi->s_lock);
 
 	call_rcu(&sbi->rcu, exfat_delayed_free);
@@ -100,7 +101,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
 int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
 {
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
-	struct pbr64 *bpb;
+	struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data;
 	bool sync = 0;
 
 	/* flags are not changed */
@@ -115,15 +116,6 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
 	if (sb_rdonly(sb))
 		return 0;
 
-	if (!sbi->pbr_bh) {
-		sbi->pbr_bh = sb_bread(sb, 0);
-		if (!sbi->pbr_bh) {
-			exfat_msg(sb, KERN_ERR, "failed to read boot sector");
-			return -ENOMEM;
-		}
-	}
-
-	bpb = (struct pbr64 *)sbi->pbr_bh->b_data;
 	bpb->bsx.vol_flags = cpu_to_le16(new_flag);
 
 	if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh))
@@ -159,7 +151,6 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",iocharset=utf8");
 	else if (sbi->nls_io)
 		seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
-	seq_printf(m, ",bps=%ld", sb->s_blocksize);
 	if (opts->errors == EXFAT_ERRORS_CONT)
 		seq_puts(m, ",errors=continue");
 	else if (opts->errors == EXFAT_ERRORS_PANIC)
@@ -351,14 +342,15 @@ static int exfat_read_root(struct inode *inode)
 	exfat_save_attr(inode, ATTR_SUBDIR);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
 		current_time(inode);
+	exfat_truncate_atime(&inode->i_atime);
 	exfat_cache_init_inode(inode);
 	return 0;
 }
 
-static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb,
-		struct buffer_head **prev_bh)
+static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb)
 {
-	struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct pbr *p_pbr = (struct pbr *) (sbi->pbr_bh)->b_data;
 	unsigned short logical_sect = 0;
 
 	logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits;
@@ -378,26 +370,23 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb,
 	}
 
 	if (logical_sect > sb->s_blocksize) {
-		struct buffer_head *bh = NULL;
-
-		__brelse(*prev_bh);
-		*prev_bh = NULL;
+		brelse(sbi->pbr_bh);
+		sbi->pbr_bh = NULL;
 
 		if (!sb_set_blocksize(sb, logical_sect)) {
 			exfat_msg(sb, KERN_ERR,
 				"unable to set blocksize %u", logical_sect);
 			return NULL;
 		}
-		bh = sb_bread(sb, 0);
-		if (!bh) {
+		sbi->pbr_bh = sb_bread(sb, 0);
+		if (!sbi->pbr_bh) {
 			exfat_msg(sb, KERN_ERR,
 				"unable to read boot sector (logical sector size = %lu)",
 				sb->s_blocksize);
 			return NULL;
 		}
 
-		*prev_bh = bh;
-		p_pbr = (struct pbr *) bh->b_data;
+		p_pbr = (struct pbr *)sbi->pbr_bh->b_data;
 	}
 	return p_pbr;
 }
@@ -408,21 +397,20 @@ static int __exfat_fill_super(struct super_block *sb)
 	int ret;
 	struct pbr *p_pbr;
 	struct pbr64 *p_bpb;
-	struct buffer_head *bh;
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
 
 	/* set block size to read super block */
 	sb_min_blocksize(sb, 512);
 
 	/* read boot sector */
-	bh = sb_bread(sb, 0);
-	if (!bh) {
+	sbi->pbr_bh = sb_bread(sb, 0);
+	if (!sbi->pbr_bh) {
 		exfat_msg(sb, KERN_ERR, "unable to read boot sector");
 		return -EIO;
 	}
 
 	/* PRB is read */
-	p_pbr = (struct pbr *)bh->b_data;
+	p_pbr = (struct pbr *)sbi->pbr_bh->b_data;
 
 	/* check the validity of PBR */
 	if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) {
@@ -433,7 +421,7 @@ static int __exfat_fill_super(struct super_block *sb)
 	}
 
 	/* check logical sector size */
-	p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh);
+	p_pbr = exfat_read_pbr_with_logical_sector(sb);
 	if (!p_pbr) {
 		ret = -EIO;
 		goto free_bh;
@@ -514,7 +502,7 @@ free_alloc_bitmap:
 free_upcase_table:
 	exfat_free_upcase_table(sbi);
 free_bh:
-	brelse(bh);
+	brelse(sbi->pbr_bh);
 	return ret;
 }
 
@@ -531,17 +519,18 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (opts->discard) {
 		struct request_queue *q = bdev_get_queue(sb->s_bdev);
 
-		if (!blk_queue_discard(q))
+		if (!blk_queue_discard(q)) {
 			exfat_msg(sb, KERN_WARNING,
 				"mounting with \"discard\" option, but the device does not support discard");
-		opts->discard = 0;
+			opts->discard = 0;
+		}
 	}
 
 	sb->s_flags |= SB_NODIRATIME;
 	sb->s_magic = EXFAT_SUPER_MAGIC;
 	sb->s_op = &exfat_sops;
 
-	sb->s_time_gran = 1;
+	sb->s_time_gran = 10 * NSEC_PER_MSEC;
 	sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
 	sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
 
@@ -605,6 +594,7 @@ put_inode:
 free_table:
 	exfat_free_upcase_table(sbi);
 	exfat_free_bitmap(sbi);
+	brelse(sbi->pbr_bh);
 
 check_nls_io:
 	unload_nls(sbi->nls_io);
@@ -717,6 +707,7 @@ static void __exit exit_exfat_fs(void)
 module_init(init_exfat_fs);
 module_exit(exit_exfat_fs);
 
+MODULE_ALIAS_FS("exfat");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("exFAT filesystem support");
 MODULE_AUTHOR("Samsung Electronics Co., Ltd.");
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0e0a4d6209c7..a32e5f7b5385 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -410,7 +410,7 @@ verified:
  * Read the bitmap for a given block_group,and validate the
  * bits for block/inode/inode tables are set in the bitmaps
  *
- * Return buffer_head on success or NULL in case of failure.
+ * Return buffer_head on success or an ERR_PTR in case of failure.
  */
 struct buffer_head *
 ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
@@ -502,7 +502,7 @@ out:
 	return ERR_PTR(err);
 }
 
-/* Returns 0 on success, 1 on error */
+/* Returns 0 on success, -errno on error */
 int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 			   struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7f16e1af8d5c..0c76cdd44d90 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -338,9 +338,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
-				struct ext4_super_block *es;
-
-				es = EXT4_SB(inode->i_sb)->s_es;
 				ext4_error_inode_err(inode, where, line,
 						     bh->b_blocknr, EIO,
 					"IO error syncing itable block");
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 031752cfb6f7..f2b577b315a0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3374,8 +3374,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		(unsigned long long)map->m_lblk, map_len);
 
 	sbi = EXT4_SB(inode->i_sb);
-	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
-		inode->i_sb->s_blocksize_bits;
+	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+			>> inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map_len)
 		eof_block = map->m_lblk + map_len;
 
@@ -3627,8 +3627,8 @@ static int ext4_split_convert_extents(handle_t *handle,
 		  __func__, inode->i_ino,
 		  (unsigned long long)map->m_lblk, map->m_len);
 
-	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
-		inode->i_sb->s_blocksize_bits;
+	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+			>> inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
 		eof_block = map->m_lblk + map->m_len;
 	/*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b420c9dc444d..4b8c9a9bdf0c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -113,7 +113,7 @@ verified:
  * Read the inode allocation bitmap for a given block_group, reading
  * into the specified slot in the superblock's bitmap cache.
  *
- * Return buffer_head of bitmap on success or NULL.
+ * Return buffer_head of bitmap on success, or an ERR_PTR on error.
 */
 static struct buffer_head *
 ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
@@ -662,7 +662,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 */
-#define RECENTCY_MIN	5
+#define RECENTCY_MIN	60
 #define RECENTCY_DIRTY	300
 
 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e416096fc081..2a4aae6acdcb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1973,7 +1973,7 @@ static int ext4_writepage(struct page *page,
 	bool keep_towrite = false;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
-		ext4_invalidatepage(page, 0, PAGE_SIZE);
+		inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
 		unlock_page(page);
 		return -EIO;
 	}
@@ -4364,7 +4364,7 @@ make_io:
 			if (end > table)
 				end = table;
 			while (b <= end)
-				sb_breadahead(sb, b++);
+				sb_breadahead_unmovable(sb, b++);
 		}
 
 		/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 87c85be4c12e..30d5d97548c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1943,7 +1943,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 	int free;
 
 	free = e4b->bd_info->bb_free;
-	BUG_ON(free <= 0);
+	if (WARN_ON(free <= 0))
+		return;
 
 	i = e4b->bd_info->bb_first_free;
 
@@ -1966,7 +1967,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		}
 
 		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
-		BUG_ON(ex.fe_len <= 0);
+		if (WARN_ON(ex.fe_len <= 0))
+			break;
 		if (free < ex.fe_len) {
 			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
 					"%d free clusters as per "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9728e7b0e84f..bf5fcb477f66 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -596,7 +596,6 @@ void __ext4_error_file(struct file *file, const char *function,
 {
 	va_list args;
 	struct va_format vaf;
-	struct ext4_super_block *es;
 	struct inode *inode = file_inode(file);
 	char pathname[80], *path;
 
@@ -604,7 +603,6 @@ void __ext4_error_file(struct file *file, const char *function,
 		return;
 
 	trace_ext4_error(inode->i_sb, function, line);
-	es = EXT4_SB(inode->i_sb)->s_es;
 	if (ext4_error_ratelimit(inode->i_sb)) {
 		path = file_path(file, pathname, sizeof(pathname));
 		if (IS_ERR(path))
@@ -4340,7 +4338,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/* Pre-read the descriptors into the buffer cache */
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logical_sb_block, i);
-		sb_breadahead(sb, block);
+		sb_breadahead_unmovable(sb, block);
 	}
 
 	for (i = 0; i < db_count; i++) {
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5190bfb6a665..c687f57fb651 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -357,7 +357,6 @@ struct io_timeout_data {
 	struct hrtimer			timer;
 	struct timespec64		ts;
 	enum hrtimer_mode		mode;
-	u32				seq_offset;
 };
 
 struct io_accept {
@@ -385,7 +384,7 @@ struct io_timeout {
 	struct file			*file;
 	u64				addr;
 	int				flags;
-	unsigned			count;
+	u32				count;
 };
 
 struct io_rw {
@@ -508,6 +507,7 @@ enum {
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 
+	REQ_F_LINK_HEAD_BIT,
 	REQ_F_LINK_NEXT_BIT,
 	REQ_F_FAIL_LINK_BIT,
 	REQ_F_INFLIGHT_BIT,
@@ -543,6 +543,8 @@ enum {
 	/* IOSQE_BUFFER_SELECT */
 	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
 
+	/* head of a link */
+	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
 	/* already grabbed next link */
 	REQ_F_LINK_NEXT		= BIT(REQ_F_LINK_NEXT_BIT),
 	/* fail rest of links */
@@ -955,8 +957,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
-				+ atomic_read(&ctx->cached_cq_overflow);
+	return req->sequence != ctx->cached_cq_tail
+				+ atomic_read(&ctx->cached_cq_overflow);
 }
 
 static inline bool req_need_defer(struct io_kiocb *req)
@@ -1437,7 +1439,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
-		req->flags &= ~REQ_F_LINK;
+		req->flags &= ~REQ_F_LINK_HEAD;
 		io_put_req(req);
 		return true;
 	}
@@ -1473,7 +1475,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 		list_del_init(&req->link_list);
 		if (!list_empty(&nxt->link_list))
-			nxt->flags |= REQ_F_LINK;
+			nxt->flags |= REQ_F_LINK_HEAD;
 		*nxtptr = nxt;
 		break;
 	}
@@ -1484,7 +1486,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 }
 
 /*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
 */
 static void io_fail_links(struct io_kiocb *req)
 {
@@ -1517,7 +1519,7 @@ static void io_fail_links(struct io_kiocb *req)
 
 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 {
-	if (likely(!(req->flags & REQ_F_LINK)))
+	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
 		return;
 
 	/*
@@ -1669,7 +1671,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 
 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 {
-	if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+	if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
@@ -2562,7 +2564,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK)
+	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
 	/*
@@ -2653,7 +2655,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK)
+	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
 	/*
@@ -2760,7 +2762,7 @@ static bool io_splice_punt(struct file *file)
 		return false;
 	if (!io_file_supports_async(file))
 		return true;
-	return !(file->f_mode & O_NONBLOCK);
+	return !(file->f_flags & O_NONBLOCK);
 }
 
 static int io_splice(struct io_kiocb *req, bool force_nonblock)
@@ -4153,25 +4155,62 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	return 1;
 }
 
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+	__acquires(&req->ctx->completion_lock)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!req->result && !READ_ONCE(poll->canceled)) {
+		struct poll_table_struct pt = { ._key = poll->events };
+
+		req->result = vfs_poll(req->file, &pt) & poll->events;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (!req->result && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		return true;
+	}
+
+	return false;
+}
+
 static void io_async_task_func(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 	struct async_poll *apoll = req->apoll;
 	struct io_ring_ctx *ctx = req->ctx;
+	bool canceled;
 
 	trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
 
-	WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+	if (io_poll_rewait(req, &apoll->poll)) {
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
 
-	if (hash_hashed(&req->hash_node)) {
-		spin_lock_irq(&ctx->completion_lock);
+	if (hash_hashed(&req->hash_node))
 		hash_del(&req->hash_node);
-		spin_unlock_irq(&ctx->completion_lock);
+
+	canceled = READ_ONCE(apoll->poll.canceled);
+	if (canceled) {
+		io_cqring_fill_event(req, -ECANCELED);
+		io_commit_cqring(ctx);
 	}
 
+	spin_unlock_irq(&ctx->completion_lock);
+
 	/* restore ->work in case we need to retry again */
 	memcpy(&req->work, &apoll->work, sizeof(req->work));
 
+	if (canceled) {
+		kfree(apoll);
+		io_cqring_ev_posted(ctx);
+		req_set_fail_links(req);
+		io_double_put_req(req);
+		return;
+	}
+
 	__set_current_state(TASK_RUNNING);
 	mutex_lock(&ctx->uring_lock);
 	__io_queue_sqe(req, NULL);
@@ -4315,11 +4354,13 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
 
 static bool io_poll_remove_one(struct io_kiocb *req)
 {
+	struct async_poll *apoll = NULL;
 	bool do_complete;
 
 	if (req->opcode == IORING_OP_POLL_ADD) {
 		do_complete = __io_poll_remove_one(req, &req->poll);
 	} else {
+		apoll = req->apoll;
 		/* non-poll requests have submit ref still */
 		do_complete = __io_poll_remove_one(req, &req->apoll->poll);
 		if (do_complete)
@@ -4328,6 +4369,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 
 	hash_del(&req->hash_node);
 
+	if (do_complete && apoll) {
+		/*
+		 * restore ->work because we need to call io_req_work_drop_env.
+		 */
+		memcpy(&req->work, &apoll->work, sizeof(req->work));
+		kfree(apoll);
+	}
+
 	if (do_complete) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(req->ctx);
@@ -4342,7 +4391,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 {
 	struct hlist_node *tmp;
 	struct io_kiocb *req;
-	int i;
+	int posted = 0, i;
 
 	spin_lock_irq(&ctx->completion_lock);
 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4350,11 +4399,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 
 		list = &ctx->cancel_hash[i];
 		hlist_for_each_entry_safe(req, tmp, list, hash_node)
-			io_poll_remove_one(req);
+			posted += io_poll_remove_one(req);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_ev_posted(ctx);
+	if (posted)
+		io_cqring_ev_posted(ctx);
 }
 
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4423,18 +4473,11 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_iocb *poll = &req->poll;
 
-	if (!req->result && !READ_ONCE(poll->canceled)) {
-		struct poll_table_struct pt = { ._key = poll->events };
-
-		req->result = vfs_poll(req->file, &pt) & poll->events;
-	}
-
-	spin_lock_irq(&ctx->completion_lock);
-	if (!req->result && !READ_ONCE(poll->canceled)) {
-		add_wait_queue(poll->head, &poll->wait);
+	if (io_poll_rewait(req, poll)) {
+		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
+
 	hash_del(&req->hash_node);
 	io_poll_complete(req, req->result, 0);
 	req->flags |= REQ_F_COMP_LOCKED;
@@ -4665,11 +4708,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_timeout(struct io_kiocb *req)
 {
-	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data;
 	struct list_head *entry;
 	unsigned span = 0;
+	u32 count = req->timeout.count;
+	u32 seq = req->sequence;
 
 	data = &req->io->timeout;
 
@@ -4678,7 +4722,6 @@ static int io_timeout(struct io_kiocb *req)
 	 * timeout event to be satisfied. If it isn't set, then this is
 	 * a pure timeout request, sequence isn't used.
 	 */
-	count = req->timeout.count;
 	if (!count) {
 		req->flags |= REQ_F_TIMEOUT_NOSEQ;
 		spin_lock_irq(&ctx->completion_lock);
@@ -4686,8 +4729,7 @@ static int io_timeout(struct io_kiocb *req)
 		goto add;
 	}
 
-	req->sequence = ctx->cached_sq_head + count - 1;
-	data->seq_offset = count;
+	req->sequence = seq + count;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -4696,26 +4738,26 @@ static int io_timeout(struct io_kiocb *req)
 	spin_lock_irq(&ctx->completion_lock);
 	list_for_each_prev(entry, &ctx->timeout_list) {
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
-		unsigned nxt_sq_head;
+		unsigned nxt_seq;
 		long long tmp, tmp_nxt;
-		u32 nxt_offset = nxt->io->timeout.seq_offset;
+		u32 nxt_offset = nxt->timeout.count;
 
 		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
 			continue;
 
 		/*
-		 * Since cached_sq_head + count - 1 can overflow, use type long
+		 * Since seq + count can overflow, use type long
 		 * long to store it.
 		 */
-		tmp = (long long)ctx->cached_sq_head + count - 1;
-		nxt_sq_head = nxt->sequence - nxt_offset + 1;
-		tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+		tmp = (long long)seq + count;
+		nxt_seq = nxt->sequence - nxt_offset;
+		tmp_nxt = (long long)nxt_seq + nxt_offset;
 
 		/*
 		 * cached_sq_head may overflow, and it will never overflow twice
 		 * once there is some timeout req still be valid.
 		 */
-		if (ctx->cached_sq_head < nxt_sq_head)
+		if (seq < nxt_seq)
 			tmp += UINT_MAX;
 
 		if (tmp > tmp_nxt)
@@ -5476,7 +5518,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 
-	if (!(req->flags & REQ_F_LINK))
+	if (!(req->flags & REQ_F_LINK_HEAD))
 		return NULL;
 	/* for polled retry, if flag is set, we already went through here */
 	if (req->flags & REQ_F_POLLED)
@@ -5604,54 +5646,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 		io_queue_sqe(req, NULL);
 }
 
-#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
-				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-				IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned int sqe_flags;
-	int ret, id, fd;
-
-	sqe_flags = READ_ONCE(sqe->flags);
-
-	/* enforce forwards compatibility on users */
-	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
-		ret = -EINVAL;
-		goto err_req;
-	}
-
-	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
-	    !io_op_defs[req->opcode].buffer_select) {
-		ret = -EOPNOTSUPP;
-		goto err_req;
-	}
-
-	id = READ_ONCE(sqe->personality);
-	if (id) {
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds)) {
-			ret = -EINVAL;
-			goto err_req;
-		}
-		get_cred(req->work.creds);
-	}
-
-	/* same numerical values with corresponding REQ_F_*, safe to copy */
-	req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
-					IOSQE_ASYNC | IOSQE_FIXED_FILE |
-					IOSQE_BUFFER_SELECT);
-
-	fd = READ_ONCE(sqe->fd);
-	ret = io_req_set_file(state, req, fd, sqe_flags);
-	if (unlikely(ret)) {
-err_req:
-		io_cqring_add_event(req, ret);
-		io_double_put_req(req);
-		return false;
-	}
+	int ret;
 
 	/*
 	 * If we already have a head request, queue this one for async
@@ -5670,42 +5669,39 @@ err_req:
 		 * next after the link request. The last one is done via
 		 * drain_next flag to persist the effect across calls.
 		 */
-		if (sqe_flags & IOSQE_IO_DRAIN) {
+		if (req->flags & REQ_F_IO_DRAIN) {
 			head->flags |= REQ_F_IO_DRAIN;
 			ctx->drain_next = 1;
 		}
-		if (io_alloc_async_ctx(req)) {
-			ret = -EAGAIN;
-			goto err_req;
-		}
+		if (io_alloc_async_ctx(req))
+			return -EAGAIN;
 
 		ret = io_req_defer_prep(req, sqe);
 		if (ret) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
-			goto err_req;
+			return ret;
 		}
 		trace_io_uring_link(ctx, req, head);
 		list_add_tail(&req->link_list, &head->link_list);
 
 		/* last request of a link, enqueue the link */
-		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
 			io_queue_link_head(head);
 			*link = NULL;
 		}
 	} else {
 		if (unlikely(ctx->drain_next)) {
 			req->flags |= REQ_F_IO_DRAIN;
-			req->ctx->drain_next = 0;
+			ctx->drain_next = 0;
 		}
-		if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
-			req->flags |= REQ_F_LINK;
+		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+			req->flags |= REQ_F_LINK_HEAD;
 			INIT_LIST_HEAD(&req->link_list);
 
-			if (io_alloc_async_ctx(req)) {
-				ret = -EAGAIN;
-				goto err_req;
-			}
+			if (io_alloc_async_ctx(req))
+				return -EAGAIN;
+
 			ret = io_req_defer_prep(req, sqe);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
@@ -5715,7 +5711,7 @@ err_req:
 		}
 	}
 
-	return true;
+	return 0;
 }
 
 /*
@@ -5789,15 +5785,23 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
 	ctx->cached_sq_head++;
 }
 
-static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			const struct io_uring_sqe *sqe)
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
+				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+				IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+		       const struct io_uring_sqe *sqe,
+		       struct io_submit_state *state, bool async)
 {
+	unsigned int sqe_flags;
+	int id, fd;
+
 	/*
 	 * All io need record the previous position, if LINK vs DARIN,
 	 * it can be used to mark the position of the first IO in the
 	 * link list.
 	 */
-	req->sequence = ctx->cached_sq_head;
+	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
 	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
@@ -5808,17 +5812,50 @@ static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
+	req->needs_fixed_file = async;
 	INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+	if (unlikely(req->opcode >= IORING_OP_LAST))
+		return -EINVAL;
+
+	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		use_mm(ctx->sqo_mm);
+	}
+
+	sqe_flags = READ_ONCE(sqe->flags);
+	/* enforce forwards compatibility on users */
+	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+		return -EINVAL;
+
+	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+	    !io_op_defs[req->opcode].buffer_select)
+		return -EOPNOTSUPP;
+
+	id = READ_ONCE(sqe->personality);
+	if (id) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
+			return -EINVAL;
+		get_cred(req->work.creds);
+	}
+
+	/* same numerical values with corresponding REQ_F_*, safe to copy */
+	req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+					IOSQE_ASYNC | IOSQE_FIXED_FILE |
+					IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+
+	fd = READ_ONCE(sqe->fd);
+	return io_req_set_file(state, req, fd, sqe_flags);
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
-			  struct file *ring_file, int ring_fd,
-			  struct mm_struct **mm, bool async)
+			  struct file *ring_file, int ring_fd, bool async)
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
 	int i, submitted = 0;
-	bool mm_fault = false;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
 	if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5858,34 +5895,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			break;
 		}
 
-		io_init_req(ctx, req, sqe);
+		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 
-		if (unlikely(req->opcode >= IORING_OP_LAST)) {
-			err = -EINVAL;
+		if (unlikely(err)) {
fail_req:
 			io_cqring_add_event(req, err);
 			io_double_put_req(req);
 			break;
 		}
 
-		if (io_op_defs[req->opcode].needs_mm && !*mm) {
-			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
-			if (unlikely(mm_fault)) {
-				err = -EFAULT;
-				goto fail_req;
-			}
-			use_mm(ctx->sqo_mm);
-			*mm = ctx->sqo_mm;
-		}
-
-		req->needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
						true, async);
-		if (!io_submit_sqe(req, sqe, statep, &link))
-			break;
+		err = io_submit_sqe(req, sqe, statep, &link);
+		if (err)
+			goto fail_req;
 	}
 
 	if (unlikely(submitted != nr)) {
@@ -5904,10 +5930,19 @@ fail_req:
 	return submitted;
 }
 
+static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
 static int io_sq_thread(void *data)
 {
 	struct io_ring_ctx *ctx = data;
-	struct mm_struct *cur_mm = NULL;
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
@@ -5948,11 +5983,7 @@ static int io_sq_thread(void *data)
 			 * adding ourselves to the waitqueue, as the unuse/drop
 			 * may sleep.
 			 */
-			if (cur_mm) {
-				unuse_mm(cur_mm);
-				mmput(cur_mm);
-				cur_mm = NULL;
-			}
+			io_sq_thread_drop_mm(ctx);
 
 			/*
 			 * We're polling. If we're within the defined idle
@@ -6016,7 +6047,7 @@ static int io_sq_thread(void *data)
 		}
 
 		mutex_lock(&ctx->uring_lock);
-		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
 		mutex_unlock(&ctx->uring_lock);
 		timeout = jiffies + ctx->sq_thread_idle;
 	}
@@ -6025,10 +6056,7 @@ static int io_sq_thread(void *data)
 		task_work_run();
 
 	set_fs(old_fs);
-	if (cur_mm) {
-		unuse_mm(cur_mm);
-		mmput(cur_mm);
-	}
+	io_sq_thread_drop_mm(ctx);
 	revert_creds(old_cred);
 
 	kthread_parkme();
@@ -7509,13 +7537,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			wake_up(&ctx->sqo_wait);
 		submitted = to_submit;
 	} else if (to_submit) {
-		struct mm_struct *cur_mm;
-
 		mutex_lock(&ctx->uring_lock);
-		/* already have mm, so io_submit_sqes() won't try to grab it */
-		cur_mm = ctx->sqo_mm;
-		submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
-					   &cur_mm, false);
+		submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
 		mutex_unlock(&ctx->uring_lock);
 
 		if (submitted != to_submit)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f2dc35c22964..b8d78f393365 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2023,6 +2023,7 @@ lookup_again:
 			goto lookup_again;
 		}
 
+		spin_unlock(&ino->i_lock);
 		first = true;
 		status = nfs4_select_rw_stateid(ctx->state,
 					iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
@@ -2032,12 +2033,12 @@ lookup_again:
 			trace_pnfs_update_layout(ino, pos, count,
 					iomode, lo, lseg,
 					PNFS_UPDATE_LAYOUT_INVALID_OPEN);
-			spin_unlock(&ino->i_lock);
 			nfs4_schedule_stateid_recovery(server, ctx->state);
 			pnfs_clear_first_layoutget(lo);
 			pnfs_put_layout_hdr(lo);
 			goto lookup_again;
 		}
+		spin_lock(&ino->i_lock);
 	} else {
 		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
 	}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c3b11a715082..5cf91322de0f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1312,6 +1312,7 @@ nfsd4_run_cb_work(struct work_struct *work)
 		container_of(work, struct nfsd4_callback, cb_work);
 	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *clnt;
+	int flags;
 
 	if (cb->cb_need_restart) {
 		cb->cb_need_restart = false;
@@ -1340,7 +1341,8 @@ nfsd4_run_cb_work(struct work_struct *work)
 	}
 
 	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
-	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
+	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
			cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e32ecedece0f..c107caa56525 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -267,6 +267,8 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
 	if (!nbl) {
 		nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
 		if (nbl) {
+			INIT_LIST_HEAD(&nbl->nbl_list);
+			INIT_LIST_HEAD(&nbl->nbl_lru);
 			fh_copy_shallow(&nbl->nbl_fh, fh);
 			locks_init_lock(&nbl->nbl_lock);
 			nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6042b646ab27..eb2255e95f62 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1573,6 +1573,7 @@ static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
 	noffsets = 0;
 	for (pos = kbuf; pos; pos = next_line) {
 		struct proc_timens_offset *off = &offsets[noffsets];
+		char clock[10];
 		int err;
 
 		/* Find the end of line and ensure we don't look past it */
@@ -1584,10 +1585,21 @@ static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
 			next_line = NULL;
 		}
 
-		err = sscanf(pos, "%u %lld %lu", &off->clockid,
+		err = sscanf(pos, "%9s %lld %lu", clock,
 				&off->val.tv_sec, &off->val.tv_nsec);
 		if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
 			goto out;
+
+		clock[sizeof(clock) - 1] = 0;
+		if (strcmp(clock, "monotonic") == 0 ||
+		    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+			off->clockid = CLOCK_MONOTONIC;
+		else if (strcmp(clock, "boottime") == 0 ||
+			 strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+			off->clockid = CLOCK_BOOTTIME;
+		else
+			goto out;
+
 		noffsets++;
 		if (noffsets == ARRAY_SIZE(offsets)) {
 			if (next_line)
@@ -3274,7 +3286,6 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 void proc_flush_pid(struct pid *pid)
 {
 	proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
-	put_pid(pid);
 }
 
 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 2633f10446c3..cdbe9293ea55 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -196,6 +196,13 @@ static void proc_kill_sb(struct super_block *sb)
 	if (ns->proc_thread_self)
 		dput(ns->proc_thread_self);
 	kill_anon_super(sb);
+
+	/* Make the pid namespace safe for the next mount of proc */
+	ns->proc_self = NULL;
+	ns->proc_thread_self = NULL;
+	ns->pid_gid = GLOBAL_ROOT_GID;
+	ns->hide_pid = 0;
+
 	put_pid_ns(ns);
 }
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7dc800cce354..c663202da8de 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -266,7 +266,8 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
 		if (start < offset + dump->size) {
 			tsz = min(offset + (u64)dump->size - start, (u64)size);
 			buf = dump->buf + start - offset;
-			if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+			if (remap_vmalloc_range_partial(vma, dst, buf, 0,
+							tsz)) {
 				ret = -EFAULT;
 				goto out_unlock;
 			}
@@ -624,7 +625,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
 		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
 		kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
 		if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
-						kaddr, tsz))
+						kaddr, 0, tsz))
 			goto fail;
 		size -= tsz;
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a7be7a9e5c1a..8bf1d15be3f6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -911,7 +911,12 @@ xfs_eofblocks_worker(
 {
 	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);
+
+	if (!sb_start_write_trylock(mp->m_super))
+		return;
 	xfs_icache_free_eofblocks(mp, NULL);
+	sb_end_write(mp->m_super);
+
 	xfs_queue_eofblocks(mp);
 }
 
@@ -938,7 +943,12 @@ xfs_cowblocks_worker(
 {
 	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_cowblocks_work);
+
+	if (!sb_start_write_trylock(mp->m_super))
+		return;
 	xfs_icache_free_cowblocks(mp, NULL);
+	sb_end_write(mp->m_super);
+
 	xfs_queue_cowblocks(mp);
 }
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index cdfb3cd9a25b..309958186d33 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -2363,7 +2363,10 @@ xfs_file_ioctl(
 		if (error)
 			return error;
 
-		return xfs_icache_free_eofblocks(mp, &keofb);
+		sb_start_write(mp->m_super);
+		error = xfs_icache_free_eofblocks(mp, &keofb);
+		sb_end_write(mp->m_super);
+		return error;
 	}
 
 	default:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 50c43422fa17..b2e4598fdf7d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -167,8 +167,12 @@ typedef struct xfs_mount {
 	struct xfs_kobj		m_error_meta_kobj;
 	struct xfs_error_cfg	m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
 	struct xstats		m_stats;	/* per-fs stats */
-	struct ratelimit_state	m_flush_inodes_ratelimit;
+	/*
+	 * Workqueue item so that we can coalesce multiple inode flush attempts
+	 * into a single flush.
+	 */
+	struct work_struct	m_flush_inodes_work;
 
 	struct workqueue_struct *m_buf_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
 	struct workqueue_struct	*m_cil_workqueue;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b0ce04ffd3cd..107bf2a2f344 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1051,6 +1051,7 @@ xfs_reflink_remap_extent(
 		uirec.br_startblock = irec->br_startblock + rlen;
 		uirec.br_startoff = irec->br_startoff + rlen;
 		uirec.br_blockcount = unmap_len - rlen;
+		uirec.br_state = irec->br_state;
 		unmap_len = rlen;
 
 		/* If this isn't a real mapping, we're done. */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index abf06bf9c3f3..424bb9a2d532 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -516,6 +516,20 @@ xfs_destroy_mount_workqueues(
 	destroy_workqueue(mp->m_buf_workqueue);
 }
 
+static void
+xfs_flush_inodes_worker(
+	struct work_struct	*work)
+{
+	struct xfs_mount	*mp = container_of(work, struct xfs_mount,
+						   m_flush_inodes_work);
+	struct super_block	*sb = mp->m_super;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		sync_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
+}
+
 /*
  * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
  * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
@@ -526,15 +540,15 @@ void
 xfs_flush_inodes(
 	struct xfs_mount	*mp)
 {
-	struct super_block	*sb = mp->m_super;
-
-	if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
+	/*
+	 * If flush_work() returns true then that means we waited for a flush
+	 * which was already in progress.  Don't bother running another scan.
+	 */
+	if (flush_work(&mp->m_flush_inodes_work))
 		return;
 
-	if (down_read_trylock(&sb->s_umount)) {
-		sync_inodes_sb(sb);
-		up_read(&sb->s_umount);
-	}
+	queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
+	flush_work(&mp->m_flush_inodes_work);
 }
 
 /* Catch misguided souls that try to use this interface on XFS */
@@ -1369,17 +1383,6 @@ xfs_fc_fill_super(
 	if (error)
 		goto out_free_names;
 
-	/*
-	 * Cap the number of invocations of xfs_flush_inodes to 16 for every
-	 * quarter of a second.  The magic numbers here were determined by
-	 * observation neither to cause stalls in writeback when there are a
-	 * lot of IO threads and the fs is near ENOSPC, nor cause any fstest
-	 * regressions.  YMMV.
-	 */
-	ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
-	ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
-			    RATELIMIT_MSG_ON_RELEASE);
-
 	error = xfs_init_mount_workqueues(mp);
 	if (error)
 		goto out_close_devices;
@@ -1752,6 +1755,7 @@ static int xfs_init_fs_context(
 	spin_lock_init(&mp->m_perag_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
+	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
 	INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);