Diffstat (limited to 'fs')
129 files changed, 1994 insertions, 1568 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index c14533ef108f..b5b8de521f99 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req) if (xas_retry(&xas, folio)) continue; BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio_file_mapping(folio), ==, mapping); + ASSERTCMP(folio->mapping, ==, mapping); folio_put(folio); } @@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); for (offset = 0; offset < size; offset += sizeof(*block)) { block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); + pr_warn("[%02lx] %32phN\n", folio->index + offset, block); kunmap_local(block); } } @@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { afs_dir_dump(dvnode, req); @@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, continue; } + /* Don't expose silly rename entries to userspace. */ + if (nlen > 6 && + dire->u.name[0] == '.' && + ctx->actor != afs_lookup_filldir && + ctx->actor != afs_lookup_one_filldir && + memcmp(dire->u.name, ".__afs", 6) == 0) + continue; + /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), @@ -708,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op) break; } + if (vp->scb.status.abort_code) + trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); if (!vp->scb.have_status && !vp->scb.have_error) continue; @@ -897,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(afs_op_error(op)); out_op: if (!afs_op_error(op)) { - inode = &op->file[1].vnode->netfs.inode; - op->file[1].vnode = NULL; + if (op->file[1].scb.status.abort_code) { + afs_op_accumulate_error(op, -ECONNABORTED, + op->file[1].scb.status.abort_code); + } else { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } } if (op->file[0].scb.have_status) @@ -2022,7 +2036,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); folio_detach_private(folio); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index d3bc4a2d7085..c4d2711e20ad 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; -/* - * Dirs in the dynamic root don't need revalidation. 
- */ -static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - return 1; -} - const struct dentry_operations afs_dynroot_dentry_operations = { - .d_revalidate = afs_dynroot_d_revalidate, .d_delete = always_delete_dentry, .d_release = afs_d_release, .d_automount = afs_d_automount, diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 3bd02571f30d..15eab053af6d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) if (!preflist) { seq_puts(m, "NO PREFS\n"); - return 0; + goto out; } seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", @@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) } } - rcu_read_lock(); +out: + rcu_read_unlock(); return 0; } diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 10704f2d3af5..fd3e175d8342 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1715,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, * This works without any other locks because this is the only * thread that removes items from the need_discard tree */ - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); blkdev_issue_discard(ca->disk_sb.bdev, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index bed75c93c069..684397442338 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -92,7 +92,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) continue; bch2_btree_trans_to_text(out, i->trans); - bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1); + bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); } } @@ -227,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index cadda9bbe4a4..7bdba8507fc9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -627,7 +627,7 @@ restart: prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0); + bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index dc52918d06ef..8c70123b6a0c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, continue; bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_FLUSH, + REQ_OP_WRITE|REQ_PREFLUSH, GFP_KERNEL, &c->nocow_flush_bioset), struct nocow_flush, bio); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3a4c24c28e7f..3dc8630ff9fe 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (IS_ERR(victim)) return PTR_ERR(victim); + dir = d_inode(path.dentry); if (victim->d_sb->s_fs_info != c) { ret = -EXDEV; goto err; @@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -ENOENT; goto err; } - dir = d_inode(path.dentry); ret = __bch2_unlink(dir, 
victim, true); if (!ret) { fsnotify_rmdir(dir, victim); d_delete(victim); } - inode_unlock(dir); err: + inode_unlock(dir); dput(victim); path_put(&path); return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4f0ecd605675..6a760777bafb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -119,22 +119,19 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } -static int __lookup_dirent(struct btree_trans *trans, +static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type) + u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; struct bkey_s_c_dirent d; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0); + int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); if (ret) return ret; @@ -225,15 +222,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot); + u32 root_inode_snapshot = snapshot; + ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); bch_err_msg(c, ret, "looking up root inode"); if (ret) return ret; root_hash_info = bch2_hash_info_init(c, &root_inode); - ret = __lookup_dirent(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type); + ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type, snapshot); if (bch2_err_matches(ret, ENOENT)) goto create_lostfound; @@ -250,7 +248,10 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - return lookup_inode(trans, inum, lostfound, &snapshot); + ret = lookup_inode(trans, inum, lostfound, &snapshot); + bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", + inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); + return ret; create_lostfound: /* diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d71d26e39521..bc890776eb57 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -233,7 +233,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t prt_str(&pbuf, "entry size: "); prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); prt_newline(&pbuf); - bch2_prt_task_backtrace(&pbuf, current, 1); + bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); trace_journal_entry_close(c, pbuf.buf); printbuf_exit(&pbuf); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 04a1e79a5ed3..bfd6585e746d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1988,7 +1988,8 @@ CLOSURE_CALLBACK(bch2_journal_write) percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio_reset(bio, ca->disk_sb.bdev, + REQ_OP_WRITE|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index b2be565bb8f2..64df11ab422b 100644 --- a/fs/bcachefs/mean_and_variance.h +++ 
b/fs/bcachefs/mean_and_variance.h @@ -17,7 +17,7 @@ * Rust and rustc has issues with u128. */ -#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) +#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) typedef struct { unsigned __int128 v; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 89fdb7c21134..fcaa5a888744 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s } static __always_inline int -bch2_hash_lookup(struct btree_trans *trans, +bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + unsigned flags, u32 snapshot) { struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), @@ -195,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans, } static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); +} + +static __always_inline int bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index b1c867aa2b58..9220d7de10db 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -53,9 +53,9 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, if (ret) goto err; - fd_install(fd, file); get_task_struct(thr->task); wake_up_process(thr->task); + fd_install(fd, file); return fd; err: if (fd >= 0) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a135136adeee..231003b405ef 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -272,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, + gfp_t gfp) { #ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; - int ret = 0; stack->nr = 0; - ret = darray_make_room(stack, 32); + int ret = darray_make_room_gfp(stack, 32, gfp); if (ret) return ret; @@ -308,10 +308,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) } } -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr) +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) { bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task, skipnr + 1); + int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); bch2_prt_backtrace(out, &stack); darray_exit(&stack); @@ -418,14 +418,15 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, bch2_quantiles_update(&stats->quantiles, duration); } - if (time_after64(end, stats->last_event)) { + if (stats->last_event && time_after64(end, stats->last_event)) { 
freq = end - stats->last_event; mean_and_variance_update(&stats->freq_stats, freq); mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); stats->max_freq = max(stats->max_freq, freq); stats->min_freq = min(stats->min_freq, freq); - stats->last_event = end; } + + stats->last_event = end; } static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index df67bf55fe2b..b414736d59a5 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -348,9 +348,9 @@ void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned); +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb..68345f73d429 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. 
+ */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c6907d533fe8..e71ef97d0a7c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1336,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { - /* Shouldn't get preallocated anon_dev for cached roots */ - ASSERT(!anon_dev); + /* + * Some other caller may have read out the newly inserted + * subvolume already (for things like backref walk etc). Not + * that common but still possible. In that case, we just need + * to free the anon_dev. + */ + if (unlikely(anon_dev)) { + free_anon_bdev(anon_dev); + anon_dev = 0; + } + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c57..8e8cc1111277 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. 
*/ if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; } @@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 809b11472a80..1eb93d3962aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress.
This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4565,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41b479861b3c..ac3316e0d11c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; @@ -3808,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } + if (sa->create && is_fstree(sa->qgroupid)) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..e43bc0fdc74e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > 
max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 63b426cc7798..5470e1cdf10c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1736,6 +1736,15 @@ out: return ret; } +static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) +{ + return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || + qgroup->excl > 0 || qgroup->excl_cmpr > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); +} + int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { + ret = -EBUSY; + goto out; + } + /* Check if there are no children of this qgroup */ if (!list_empty(&qgroup->members)) { ret = -EBUSY; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e993..8c4fc98ca9ce 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d4..0123d2728923 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if 
(bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && @@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. 
*/ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e5..7902298c1f25 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8111,7 +8111,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { - ret = -EINVAL; + ret = -EOPNOTSUPP; goto out; } @@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 93511d54abf8..0e49dab8dad2 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 896acfda1789..101f786963d4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 50fdc69fdddf..6eccf8496486 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2..d67785be2c77 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e..8da66ea699e8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { 
struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { + ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we
btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5bd76813b23f..168af9d000d1 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2055,6 +2055,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2067,7 +2068,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2087,20 +2087,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2108,8 +2105,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2117,6 +2112,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 5fd74ec60bef..4ba42f1fa3b4 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) struct fscache_volume *volume = object->volume->vcookie; size_t volume_key_size, cookie_key_size, data_len; + if (!object->ondemand) + return 0; + /* * CacheFiles will firstly check the cache file under the root cache * directory. 
If the coherency check failed, it will fallback to diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9c02f328c966..ad1f46c66fbf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; - arg->xattr_buf = ci->i_xattrs.blob; + arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; @@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); + ceph_buffer_put(arg->xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } @@ -3215,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, - PUT_CAP_REFS_NO_CHECK, PUT_CAP_REFS_ASYNC, }; @@ -3331,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } -void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) -{ - __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); -} - /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0c25d326afc4..7b2e77517f23 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + if (!S_ISLNK(*mode)) { err = ceph_pre_init_acls(dir, mode, as_ctx); if (err < 0) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 548d1de379f3..f71bb9c9569f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -4261,7 +4261,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) } } -void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; @@ -4269,8 +4269,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); - ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), - dcaps); + ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); } } @@ -4306,7 +4305,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_session->s_mds != session->s_mds) continue; - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); __send_request(session, req, true); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2e6ddaa13d72..40560af38827 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -552,7 +552,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request 
*req); -extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b06e2bc86221..b63b4cd9b5b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1255,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); -extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, - int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_remove_capsnap(struct inode *inode, diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 279933e007d2..7cc5841577b2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,13 +11,12 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 072ef6a66823..d4cee95af14c 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = erofs_allocpage(pagepool, rq->gfp); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 4a64a9c91dd3..b98872058abe 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb, } int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -158,8 +158,12 @@ again: strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); outsz -= strm->z.avail_out; if (!rq->out[no]) { - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + kout = NULL; + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -211,8 +215,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -230,7 +237,7 @@ again: break; } } - +failed: if (zlib_inflateEnd(&strm->z) != Z_OK && !err) err = -EIO; if (kout) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 2dd14f99c1dc..6ca357d83cfa 100644 --- 
a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -148,7 +148,7 @@ again: } int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -215,8 +215,11 @@ again: PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; if (!rq->out[no] && rq->fillgaps) { /* deduped */ - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -258,8 +261,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -277,6 +283,7 @@ again: break; } } +failed: if (no < nrpages_out && strm->buf.out) kunmap(rq->out[no]); if (ni < nrpages_in) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index bc12030393b2..5ff90026fd43 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -459,7 +459,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3d616dea55dc..36e638e8b53a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, } else { const unsigned int gotten = sb->s_blocksize - *ofs; - copied = kmalloc(vi->inode_isize, GFP_NOFS); + copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { err = -ENOMEM; goto err_out; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 5dea308764b4..e146d09151af 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); + NULL, grp, GFP_KERNEL); if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 692c0c39be63..ff0aa72b0db3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -82,6 +82,9 @@ struct z_erofs_pcluster { /* L: indicate several pageofs_outs or not */ bool multibases; + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; @@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_KERNEL); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); pcl->pclustersize = size; @@ -563,21 +566,19 
@@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (i_blocksize(fe->inode) != PAGE_SIZE) - return; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pclusterpages; ++i) { struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - /* the compressed page was loaded before */ + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; page = find_get_page(mc, pcl->obj.index + i); - if (page) { t = (void *)((unsigned long)page | 1); newpage = NULL; @@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); t = (void *)((unsigned long)newpage | 1); } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + spin_lock(&pcl->obj.lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = t; + spin_unlock(&pcl->obj.lockref.lock); continue; + } + spin_unlock(&pcl->obj.lockref.lock); if (page) put_page(page); @@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio, DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) - while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } @@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb) set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; return 0; } -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) -{ - struct z_erofs_pcluster *const pcl = fe->pcl; - - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; -} - /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) + spin_lock(&pcl->obj.lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->obj.lockref.lock); return 0; + } + spin_unlock(&pcl->obj.lockref.lock); + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) @@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct page *page, bool ra) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1014,6 +1013,7 @@ repeat: err = z_erofs_pcluster_begin(fe); if (err) goto out; + fe->pcl->besteffort |= !ra; } /* @@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, + .gfp = pcl->besteffort ? 
+ GFP_KERNEL | __GFP_NOFAIL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ @@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->length = 0; pcl->partial = true; pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, { gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *page, *oldpage; + struct page *page; int justfound, bs = i_blocksize(f->inode); /* Except for inplace pages, the entire page can be used for I/Os */ bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: - oldpage = READ_ONCE(zbv->page); - if (!oldpage) + spin_lock(&pcl->obj.lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + page = zbv.page; + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); + if (!page) goto out_allocpage; - justfound = (unsigned long)oldpage & 1UL; - page = (struct page *)((unsigned long)oldpage & ~1UL); bvec->bv_page = page; - DBG_BUGON(z_erofs_is_shortlived_page(page)); /* * Handle preallocated cached pages. We tried to allocate such pages @@ -1448,7 +1455,6 @@ repeat: */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { set_page_private(page, 0); - WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } @@ -1459,9 +1465,9 @@ repeat: * therefore it is impossible for `mapping` to be NULL. */ if (mapping && mapping != mc) { - if (zbv->offset < 0) - bvec->bv_offset = round_up(-zbv->offset, bs); - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; return; } @@ -1471,7 +1477,6 @@ repeat: /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(zbv->page, page); /* * The cached page is still available but without a valid * `->private` pcluster hint. Let's reconnect them. 
@@ -1503,11 +1508,15 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + spin_lock(&pcl->obj.lockref.lock); + if (pcl->compressed_bvecs[nr].page) { erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); bvec->bv_page = page; out_tocache: if (!tocache || bs != PAGE_SIZE || @@ -1685,6 +1694,7 @@ submit_bio_retry: if (cur + bvec.bv_len > end) bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) goto submit_bio_retry; @@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) unlock_page(page); else - (void)z_erofs_do_read_page(f, page); + (void)z_erofs_do_read_page(f, page, !!rac); put_page(page); } @@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); @@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac) folio = head; head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); diff --git a/fs/exec.c b/fs/exec.c index 8cdd5b2dd09c..af4fbb61cd53 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, @@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; @@ -948,6 +952,17 @@ exit: return ERR_PTR(err); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. + * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). 
+ */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -1409,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm) out_unlock: up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + out: return retval; } @@ -1484,6 +1502,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1495,10 +1522,7 @@ static void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ @@ -1520,8 +1544,7 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { - allow_write_access(file); - fput(file); + do_close_execat(file); return ERR_PTR(-ENOMEM); } @@ -1610,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else @@ -1826,9 +1850,6 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. - */ static int bprm_execve(struct linux_binprm *bprm) { int retval; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 522edcbb2ce4..0687f952956c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -501,7 +501,7 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); loff_t pos = iocb->ki_pos; - loff_t size = iocb->ki_pos + iov_iter_count(iter); + loff_t size = pos + iov_iter_count(iter); int rw = iov_iter_rw(iter); ssize_t ret; @@ -525,11 +525,10 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); if (ret < 0) { - if (rw == WRITE) + if (rw == WRITE && ret != -EIOCBQUEUED) exfat_write_failed(mapping, size); - if (ret != -EIOCBQUEUED) - return ret; + return ret; } else size = pos + ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a5d784872303..023571f8dd1b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -252,8 +252,10 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_DELAYED BIT(BH_Delay) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_DELAYED) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -2912,10 +2914,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); -extern int ext4_mb_release(struct super_block *); +extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern void ext4_discard_preallocations(struct inode *, 
unsigned int); +extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 01299b55a567..7669d154c05e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -2229,7 +2229,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, /* - * ext4_ext_determine_hole - determine hole around given block + * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole * @@ -2241,9 +2241,9 @@ static int ext4_fill_es_cache_info(struct inode *inode, * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ -static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t *lblk) +static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; @@ -2271,30 +2271,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap - */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, - ext4_lblk_t hole_len) -{ - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, - hole_start + hole_len - 1, &es); - if (es.es_len) { - /* There's delayed extent containing lblock? */ - if (es.es_lblk <= hole_start) - return; - hole_len = min(es.es_lblk - hole_start, hole_len); - } - ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); - ext4_es_insert_extent(inode, hole_start, hole_len, ~0, - EXTENT_STATUS_HOLE); -} - -/* * ext4_ext_rm_idx: * removes index from the index block. */ @@ -4062,6 +4038,72 @@ static int get_implied_cluster_alloc(struct super_block *sb, return 0; } +/* + * Determine the hole length around the given logical block: first try to + * locate and expand the hole from the given @path, then adjust it + * if it's partially or completely converted to delayed extents, insert + * it into the extent cache tree if it's indeed a hole, and finally return + * the length of the determined extent. + */ +static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t lblk) +{ + ext4_lblk_t hole_start, len; + struct extent_status es; + + hole_start = lblk; + len = ext4_ext_find_hole(inode, path, &hole_start); +again: + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + len - 1, &es); + if (!es.es_len) + goto insert_hole; + + /* + * There's a delalloc extent in the hole; handle it according to + * whether the delalloc extent is in front of, behind, or straddling + * the queried range. + */ + if (lblk >= es.es_lblk + es.es_len) { + /* + * The delalloc extent is in front of the queried range, + * find again from the queried start block.
+ */ + len -= lblk - hole_start; + hole_start = lblk; + goto again; + } else if (in_range(lblk, es.es_lblk, es.es_len)) { + /* + * The delalloc extent contains lblk; it must have been + * added after ext4_map_blocks() checked the extent status + * tree, as we are not holding i_rwsem and the delalloc info is + * only stabilized by the i_data_sem we are going to release + * soon. Don't modify the extent status tree; report the + * extent as a hole, and just adjust the length to the + * remaining part of the delalloc extent after lblk. + */ + len = es.es_lblk + es.es_len - lblk; + return len; + } else { + /* + * The delalloc extent is partially or completely behind + * the queried range; update the hole length up to the + * beginning of the delalloc extent. + */ + len = min(es.es_lblk - hole_start, len); + } + +insert_hole: + /* Put just found gap into cache to speed up subsequent requests */ + ext_debug(inode, " -> %u:%u\n", hole_start, len); + ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); + + /* Update hole_len to reflect hole size after lblk */ + if (hole_start != lblk) + len -= lblk - hole_start; + + return len; +} /* * Block allocation/map/preallocation routine for extents based files @@ -4179,22 +4221,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - ext4_lblk_t hole_start, hole_len; + ext4_lblk_t len; - hole_start = map->m_lblk; - hole_len = ext4_ext_determine_hole(inode, path, &hole_start); - /* - * put just found gap into cache to speed up - * subsequent requests - */ - ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); - /* Update hole_len to reflect hole size after map->m_lblk */ - if (hole_start != map->m_lblk) - hole_len -= map->m_lblk - hole_start; map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, hole_len); - + map->m_len = min_t(unsigned int, map->m_len, len); goto out; } @@ -4313,7 +4345,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free().
*/ - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -5357,7 +5389,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); @@ -5365,7 +5397,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); @@ -5497,7 +5529,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6aa15dafc677..54d6ff22585c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a9f3716119d3..d8ca7f64f952 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -714,7 +714,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5af1b0b8680e..2ccf3b5e3a7c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -371,7 +371,7 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, @@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; @@ -1703,11 +1705,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { - if (ext4_es_is_hole(&es)) { - retval = 0; - down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_es_is_hole(&es)) goto add_delayed; - } /* * Delayed extent could be allocated by fallocate. 
@@ -1749,26 +1748,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); - -add_delayed: - if (retval == 0) { - int ret; - - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? - */ - - ret = ext4_insert_delayed_block(inode, map->m_lblk); - if (ret != 0) { - retval = ret; - goto out_unlock; - } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - } else if (retval > 0) { + if (retval < 0) { + up_read(&EXT4_I(inode)->i_data_sem); + return retval; + } + if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { @@ -1783,11 +1767,21 @@ add_delayed: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); + up_read(&EXT4_I(inode)->i_data_sem); + return retval; } + up_read(&EXT4_I(inode)->i_data_sem); -out_unlock: - up_read((&EXT4_I(inode)->i_data_sem)); +add_delayed: + down_write(&EXT4_I(inode)->i_data_sem); + retval = ext4_insert_delayed_block(inode, map->m_lblk); + up_write(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval; + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); return retval; } @@ -3268,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_DELAYED) { + iomap->type = IOMAP_DELALLOC; + iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3430,35 +3427,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_end = ext4_iomap_end, }; -static bool ext4_iomap_is_delalloc(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct extent_status es; - ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map->m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) - return false; - - if (es.es_lblk > map->m_lblk) { - map->m_len = es.es_lblk - map->m_lblk; - return false; - } - - offset = map->m_lblk - es.es_lblk; - map->m_len = es.es_len - offset; - - return true; -} - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; - bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; @@ -3499,13 +3472,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; - if (ret == 0) - delalloc = ext4_iomap_is_delalloc(inode, &map); - set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); - if (delalloc && iomap->type == IOMAP_HOLE) - iomap->type = IOMAP_DELALLOC; return 0; } @@ -4015,12 +3983,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) /* If there are blocks to remove, do it */ if (stop_block > first_block) { + ext4_lblk_t hole_len = stop_block - first_block; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, - stop_block - first_block); + ext4_es_remove_extent(inode, first_block, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, @@ -4029,6 +3997,8 @@ 
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); + ext4_es_insert_extent(inode, first_block, hole_len, ~0, + EXTENT_STATUS_HOLE); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); @@ -4170,7 +4140,7 @@ int ext4_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index aa6be510eb8f..7160a71044c8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -467,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f44f668e407f..e4f7cf9d89c4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -677,7 +677,7 @@ do { \ } \ } while (0) -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; @@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) - return 0; + return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) - return NULL; + return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } - return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ @@ -842,7 +841,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int new_order; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) return; new_order = mb_avg_fragment_size_order(sb, @@ -871,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; @@ -945,7 +944,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * order. 
Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -990,7 +989,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * much and fall to CR_GOAL_LEN_SLOW in that case. */ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -1125,11 +1124,11 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, } if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -1233,6 +1232,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, atomic64_add(period, &sbi->s_mb_generation_time); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) + mb_set_bits(buddy, 0, count); + + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1891,11 +1908,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); - this_cpu_inc(discard_pa_seq); - e4b->bd_info->bb_free += count; - if (first < e4b->bd_info->bb_first_free) - e4b->bd_info->bb_first_free = first; - /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ @@ -1909,21 +1921,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; + /* + * Fastcommit replay can free already freed blocks which + * corrupts allocation info. Regenerate it. + */ + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + mb_regenerate_buddy(e4b); + goto check; + } + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); - if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block (bit %u); block bitmap corrupt.", - block); - ext4_mark_group_bitmap_corrupted( - sb, e4b->bd_group, + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); - } - goto done; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + return; } + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; @@ -1948,9 +1970,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); -done: mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); +check: mb_check_buddy(e4b); } @@ -2276,6 +2298,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, return; ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { @@ -2283,6 +2308,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } @@ -2309,12 +2335,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { - ext4_mb_unload_buddy(e4b); - return 0; - } - ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ @@ -2347,6 +2371,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); @@ -2380,12 +2405,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); - ext4_mark_group_bitmap_corrupted(ac->ac_sb, - e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; @@ -2436,12 +2461,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * have free blocks */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -2467,12 +2492,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicates that the bitmap is corrupt.
So exit @@ -3725,7 +3750,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) return count; } -int ext4_mb_release(struct super_block *sb) +void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; @@ -3801,8 +3826,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - - return 0; } static inline int ext4_issue_discard(struct super_block *sb, @@ -5284,7 +5307,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { @@ -5334,11 +5357,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, */ } atomic_add(free, &sbi->s_mb_discarded); - - return 0; } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { @@ -5352,13 +5373,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); - return 0; + return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; } /* @@ -5479,7 +5498,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode, unsigned int needed) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -5491,9 +5510,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct rb_node *iter; int err; - if (!S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) return; - } if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -5501,15 +5519,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, - atomic_read(&ei->i_prealloc_active), needed); - - if (needed == 0) - needed = UINT_MAX; + atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); - for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); @@ -5533,7 +5548,6 @@ repeat: spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); - needed--; continue; } @@ -5943,7 +5957,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* * release all resource we used in allocation */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) +static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; @@ -5980,7 +5994,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - return 0; } 
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) @@ -6761,6 +6774,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) bool set_trimmed = false; void *bitmap; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return 0; + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d7aeb5da7d86..56938532b4ce 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -192,7 +192,6 @@ struct ext4_allocation_context { */ ext4_grpblk_t ac_orig_goal_len; - __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3aa57376d9c2..7cd4afa4de1d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, goto out; o_end = o_start + len; + *moved_len = 0; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; @@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ - move_extent_per_page(o_filp, donor_inode, + *moved_len += move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); @@ -682,14 +683,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, o_start += cur_len; d_start += cur_len; } - *moved_len = o_start - orig_blk; - if (*moved_len > len) - *moved_len = len; out: if (*moved_len) { - ext4_discard_preallocations(orig_inode, 0); - ext4_discard_preallocations(donor_inode, 0); + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); } ext4_free_ext_path(path); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcba0f85dfe2..0f931d0c227d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1525,7 +1525,7 @@ void ext4_clear_inode(struct inode *inode) ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 177f1f41f225..2e215e8c3c88 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -32,25 +32,21 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent = NULL; + struct dentry *parent; struct gfs2_sbd *sdp; struct gfs2_inode *dip; - struct inode *dinode, *inode; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error, valid = 0; int had_lock = 0; - if (flags & LOOKUP_RCU) { - dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); - if (!dinode) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dinode = d_inode(parent); - } - sdp = GFS2_SB(dinode); - dip = GFS2_I(dinode); + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { @@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, - flags & LOOKUP_RCU ? 
GL_NOBLOCK : 0, &d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) goto out; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6bfc9383b7b8..1b95db2c3aac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, WARN_ON_ONCE(!may_not_block); return -ECHILD; } - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - int noblock = may_not_block ? GL_NOBLOCK : 0; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_ANY | noblock, &i_gh); + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) + return -ECHILD; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ea5b8e57d904..d746866ae3b6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); + vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; + + vm_flags = vma->vm_flags; + /* + * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip + * reserving here. Note: only for SHM hugetlbfs file, the inode + * flag S_PRIVATE is set. + */ + if (inode->i_flags & S_PRIVATE) + vm_flags |= VM_NORESERVE; + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vma->vm_flags)) + vm_flags)) goto out; ret = 0; @@ -340,7 +351,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { folio_unlock(folio); - if (!folio_test_has_hwpoisoned(folio)) + if (!folio_test_hwpoison(folio)) want = nr; else { /* @@ -1354,6 +1365,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; + struct hstate *h; char *rest; unsigned long ps; int opt; @@ -1398,11 +1410,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_pagesize: ps = memparse(param->string, &rest); - ctx->hstate = size_to_hstate(ps); - if (!ctx->hstate) { + h = size_to_hstate(ps); + if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } + ctx->hstate = h; return 0; case Opt_min_size: diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 8eec84c651bf..cb3cda1390ad 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * RETURN VALUES: none */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ - - if ((newval - tp->dmt_budmin) > BUDMIN) - return -EIO; - budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. 
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a59e7b2edaac..3298c29b5548 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -101,7 +101,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && + if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else @@ -246,13 +246,13 @@ EXPORT_SYMBOL(netfs_readahead); */ int netfs_read_folio(struct file *file, struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); struct folio *sink = NULL; int ret; - _enter("%lx", folio_index(folio)); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -460,7 +460,7 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); @@ -518,7 +518,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len) { struct netfs_io_request *rreq; - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_inode *ctx = netfs_inode(mapping->host); unsigned long long start = folio_pos(folio); size_t flen = folio_size(folio); @@ -535,7 +535,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 93dc76f34e39..a3059b3168fd 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -221,10 +221,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(fault_in_iov_iter_readable(iter, part) == part)) break; - ret = -ENOMEM; folio = netfs_grab_folio_for_write(mapping, pos, part); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; + } flen = folio_size(folio); offset = pos & (flen - 1); @@ -343,7 +344,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, break; default: WARN(true, "Unexpected modify type %u ix=%lx\n", - howto, folio_index(folio)); + howto, folio->index); ret = -EIO; goto error_folio_unlock; } @@ -648,7 +649,7 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq) xas_for_each(&xas, folio, last) { WARN(!folio_test_writeback(folio), "bad %zx @%llx page %lx %lx\n", - wreq->len, wreq->start, folio_index(folio), last); + wreq->len, wreq->start, folio->index, last); if ((finfo = netfs_folio_info(folio))) { /* Streaming writes cannot be redirtied whilst under @@ -795,7 +796,7 @@ static void netfs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) { + if (folio->index != index) { xas_reset(xas); break; } @@ -901,7 +902,7 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, long count = wbc->nr_to_write; int ret; - _enter(",%lx,%llx-%llx,%u", folio_index(folio), start, 
end, caching); + _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching); wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), NETFS_WRITEBACK); @@ -1047,7 +1048,7 @@ search_again: start = folio_pos(folio); /* May regress with THPs */ - _debug("wback %lx", folio_index(folio)); + _debug("wback %lx", folio->index); /* At this point we hold neither the i_pages lock nor the page lock: * the page may be truncated or invalidated (changing page->mapping to diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index d645f8b302a2..9397ed39b0b4 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where) { - unsigned int debug_id = cache->debug_id; + unsigned int debug_id; bool zero; int ref; if (IS_ERR_OR_NULL(cache)) return; + debug_id = cache->debug_id; zero = __refcount_dec_and_test(&cache->ref, &ref); trace_fscache_cache(debug_id, ref - 1, where); diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 4309edf33862..e8ff1e61ce79 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -124,7 +124,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. */ - if (have_unlocked && folio_index(folio) <= unlocked) + if (have_unlocked && folio->index <= unlocked) continue; unlocked = folio_next_index(folio) - 1; trace_netfs_folio(folio, netfs_folio_trace_end_copy); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 0e3af37fc924..90051ced8e2a 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -180,7 +180,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) struct netfs_folio *finfo = NULL; size_t flen = folio_size(folio); - _enter("{%lx},%zx,%zx", folio_index(folio), offset, length); + _enter("{%lx},%zx,%zx", folio->index, offset, length); folio_wait_fscache(folio); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2fa54cfd4882..7d6c657e0409 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4945,10 +4945,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); - spin_unlock(&fp->fi_lock); return false; } @@ -5557,12 +5555,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (status) goto out_unlock; + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + spin_lock(&state_lock); spin_lock(&fp->fi_lock); - if (fp->fi_had_conflict) - status = -EAGAIN; - else - status = hash_delegation_locked(dp, fp); + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); @@ -7911,14 +7910,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct nfsd_file *nf = find_any_file(fp); + struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; + spin_lock(&fp->fi_lock); + nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); - return status; + goto out; } inode = file_inode(nf->nf_file); @@ -7934,7 +7935,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - nfsd_file_put(nf); +out: + spin_unlock(&fp->fi_lock); return status; } @@ -7944,10 +7946,8 @@ check_for_locks(struct nfs4_file *fp, 
struct nfs4_lockowner *lowner) * @cstate: NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * - * The lockowner's so_count is bumped when a lock record is added - * or when copying a conflicting lock. The latter case is brief, - * but can lead to fleeting false positives when looking for - * locks-in-use. + * Check if there are any locks still held and, if not, free the lockowner + * and any lock state that is owned. * * Return values: * %nfs_ok: lockowner released or not found @@ -7983,10 +7983,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, spin_unlock(&clp->cl_lock); return nfs_ok; } - if (atomic_read(&lo->lo_owner.so_count) != 2) { - spin_unlock(&clp->cl_lock); - nfs4_put_stateowner(&lo->lo_owner); - return nfserr_locks_held; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; + } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index bec33b89a075..0e3fc5ba33c7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -107,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - folio_wait_stable(folio); + /* + * Since checksumming including data blocks is performed to determine + * the validity of the log to be written and used for recovery, it is + * necessary to wait for writeback to finish here, regardless of the + * stable write requirement of the backing device. + */ + folio_wait_writeback(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0955b657938f..a9b8d77c8c1d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - struct page *page) + loff_t pos, struct page *page) { struct buffer_head *bh_org; + size_t from = pos & ~PAGE_MASK; void *kaddr; bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); @@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, return -EIO; kaddr = kmap_atomic(page); - memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); kunmap_atomic(kaddr); brelse(bh_org); return 0; @@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, page); if (unlikely(err)) goto failed_page; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2590a0860eab..2bfb08052d39 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - set_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); @@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } break; } + set_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_begin_folio_io(fs_folio); fs_folio = bh->b_folio; @@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(bh,
&segbuf->sb_payload_buffers, b_assoc_buffers) { - clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { @@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) } break; } + clear_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, err); fs_folio = bh->b_folio; @@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Redirected)); - set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; @@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) update_sr = true; break; } + set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, 0); fs_folio = bh->b_folio; diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 63f70259edc0..7aadf5010999 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi; u8 cluster_bits; - struct ATTRIB *attr = NULL, *attr_b; + struct ATTRIB *attr, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; @@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, *len = 0; up_read(&ni->file.run_lock); - if (*len) { - if (*lcn != SPARSE_LCN || !new) - return 0; /* Fast normal way without allocation. */ - else if (clen > *len) - clen = *len; - } + if (*len && (*lcn != SPARSE_LCN || !new)) + return 0; /* Fast normal way without allocation. */ /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; @@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, ni_lock(ni); down_write(&ni->file.run_lock); + /* Repeat the code above (under write lock). */ + if (!run_lookup_entry(run, vcn, lcn, len, NULL)) + *len = 0; + + if (*len) { + if (*lcn != SPARSE_LCN || !new) + goto out; /* normal way without allocation. */ + if (clen > *len) + clen = *len; + } + le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { @@ -1736,8 +1743,10 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) - return -ENOENT; + if (!attr_b) { + err = -ENOENT; + goto out; + } attr = attr_b; le = le_b; @@ -1818,13 +1827,15 @@ ins_ext: ok: run_truncate_around(run, vcn); out: - if (new_valid > data_size) - new_valid = data_size; + if (attr_b) { + if (new_valid > data_size) + new_valid = data_size; - valid_size = le64_to_cpu(attr_b->nres.valid_size); - if (new_valid != valid_size) { - attr_b->nres.valid_size = cpu_to_le64(valid_size); - mi_b->dirty = true; + valid_size = le64_to_cpu(attr_b->nres.valid_size); + if (new_valid != valid_size) { + attr_b->nres.valid_size = cpu_to_le64(valid_size); + mi_b->dirty = true; + } } return err; @@ -2073,7 +2084,7 @@ next_attr: /* Update inode size. 
*/ ni->i_valid = valid_size; - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); @@ -2488,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) mi_b->dirty = true; done: - ni->vfs_inode.i_size += bytes; + i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 7c01735d1219..9f4bd8d26090 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni, void al_destroy(struct ntfs_inode *ni) { run_close(&ni->attr_list.run); - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; ni->attr_list.dirty = false; @@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, { size_t off; u16 sz; + const unsigned le_min_size = le_size(0); if (!le) { le = ni->attr_list.le; } else { sz = le16_to_cpu(le->size); - if (sz < sizeof(struct ATTR_LIST_ENTRY)) { + if (sz < le_min_size) { /* Impossible 'cause we should not return such le. */ return NULL; } @@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, /* Check boundary. */ off = PtrOffset(ni->attr_list.le, le); - if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) { + if (off + le_min_size > ni->attr_list.size) { /* The regular end of list. */ return NULL; } @@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, sz = le16_to_cpu(le->size); /* Check le for errors. */ - if (sz < sizeof(struct ATTR_LIST_ENTRY) || - off + sz > ni->attr_list.size || + if (sz < le_min_size || off + sz > ni->attr_list.size || sz < le->name_off + le->name_len * sizeof(short)) { return NULL; } @@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, memcpy(ptr, al->le, off); memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); le = Add2Ptr(ptr, off); - kfree(al->le); + kvfree(al->le); al->le = ptr; } else { memmove(Add2Ptr(le, sz), le, old_size - off); diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 63f14a0232f6..845f9b22deef 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd) { struct rb_node *node, *next; - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = NULL; run_close(&wnd->run); @@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = new_free; } diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index ec0566b322d5..5cf3d9decf64 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ - if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) - dt_type = DT_LNK; - else - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* + * NTFS: symlinks are "dir + reparse" or "file + reparse" + * Unfortunately the reparse attribute is used for many purposes (several dozens). + * It is not possible here to know whether this name is a symlink or not.
+ * To get the exact type of a name we should open the inode (read mft). + * getattr for an opened file (fstat) correctly returns symlink. + */ + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + + /* + * It is not reliable to detect the type of a name using the duplicated information + * stored in the parent directory. + * The only correct way to get the type of a name is to read the MFT record and find ATTR_STD. + * The code below is not a good idea. + * It takes additional locks/reads just to get the type of a name. + * Should we use an additional mount option to enable the branch below? + */ + if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) && + ino != ni->mi.rno) { + struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL); + if (!IS_ERR_OR_NULL(inode)) { + dt_type = fs_umode_to_dtype(inode->i_mode); + iput(inode); + } + } return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } @@ -495,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, struct INDEX_HDR *hdr; const struct ATTR_FILE_NAME *fname; u32 e_size, off, end; - u64 vbo = 0; size_t drs = 0, fles = 0, bit = 0; - loff_t i_size = ni->vfs_inode.i_size; struct indx_node *node = NULL; - u8 index_bits = ni->dir.index_bits; + size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits; if (is_empty) *is_empty = true; @@ -518,8 +536,10 @@ e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || - off + e_size > end) + off + e_size > end) { + /* Looks like corruption. */ break; + } if (de_is_last(e)) break; @@ -543,7 +563,7 @@ fles += 1; } - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_used_bit(&ni->dir, ni, &bit); @@ -553,8 +573,7 @@ if (bit == MINUS_ONE_T) goto out; - vbo = (u64)bit << index_bits; - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, @@ -564,7 +583,6 @@ hdr = &node->index->ihdr; bit += 1; - vbo = (u64)bit << ni->dir.idx2vbn_bits; } out: @@ -593,5 +611,9 @@ const struct file_operations ntfs_dir_operations = { .iterate_shared = ntfs_readdir, .fsync = generic_file_fsync, .open = ntfs_file_open, + .unlocked_ioctl = ntfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfs_compat_ioctl, +#endif }; // clang-format on diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a5a30a24ce5d..5418662c80d8 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) return 0; } -static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; @@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) } #ifdef CONFIG_COMPAT -static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) { return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); @@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) u32 bh_next, bh_off, to; sector_t iblock; struct folio *folio; + bool dirty = false; for (; idx < idx_end;
idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; @@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) /* Ok, it's mapped. Make sure it's up-to-date. */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = bh_read(bh, 0); - if (err < 0) { - folio_unlock(folio); - folio_put(folio); - goto out; - } + else if (bh_read(bh, 0) < 0) { + err = -EIO; + folio_unlock(folio); + folio_put(folio); + goto out; } mark_buffer_dirty(bh); - } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); folio_zero_segment(folio, from, to); + dirty = true; folio_unlock(folio); folio_put(folio); cond_resched(); } out: - mark_inode_dirty(inode); + if (dirty) + mark_inode_dirty(inode); return err; } @@ -261,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) bool rw = vma->vm_flags & VM_WRITE; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "mmap encrypted not supported"); return -EOPNOTSUPP; @@ -499,10 +501,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo, len, &frame_size); ni_unlock(ni); + if (!err) + goto ok; + if (err != E_NTFS_NOTALIGNED) goto out; /* Process not aligned punch. */ + err = 0; mask = frame_size - 1; vbo_a = (vbo + mask) & ~mask; end_a = end & ~mask; @@ -525,6 +531,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL); ni_unlock(ni); + if (err) + goto out; } } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { /* @@ -564,6 +572,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_insert_range(ni, vbo, len); ni_unlock(ni); + if (err) + goto out; } else { /* Check new size. 
*/ u8 cluster_bits = sbi->cluster_bits; @@ -633,11 +643,18 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) &ni->file.run, i_size, &ni->i_valid, true, NULL); ni_unlock(ni); + if (err) + goto out; } else if (new_size > i_size) { - inode->i_size = new_size; + i_size_write(inode, new_size); } } +ok: + err = file_modified(file); + if (err) + goto out; + out: if (map_locked) filemap_invalidate_unlock(mapping); @@ -663,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode = inode->i_mode; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -676,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; } inode_dio_wait(inode); - oldsize = inode->i_size; + oldsize = i_size_read(inode); newsize = attr->ia_size; if (newsize <= oldsize) @@ -688,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - inode->i_size = newsize; + i_size_write(inode, newsize); } setattr_copy(idmap, inode, attr); @@ -718,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -752,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -821,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(file); - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -1028,6 +1054,8 @@ out: iocb->ki_pos += written; if (iocb->ki_pos > ni->i_valid) ni->i_valid = iocb->ki_pos; + if (iocb->ki_pos > i_size) + i_size_write(inode, iocb->ki_pos); return written; } @@ -1041,8 +1069,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; + int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -1068,6 +1100,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; + err = file_modified(iocb->ki_filp); + if (err) { + ret = err; + goto out; + } + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { /* Should never be here, see ntfs_file_open(). 
*/ ret = -EOPNOTSUPP; @@ -1097,6 +1135,9 @@ int ntfs_file_open(struct inode *inode, struct file *file) { struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (unlikely((is_compressed(ni) || is_encrypted(ni)) && (file->f_flags & O_DIRECT))) { return -EOPNOTSUPP; @@ -1138,7 +1179,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file) down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, - inode->i_size, &ni->i_valid, false, NULL); + i_size_read(inode), &ni->i_valid, false, + NULL); up_write(&ni->file.run_lock); ni_unlock(ni); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 3df2d9e34b91..3b42938a9d3b 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) run_deallocate(sbi, &ni->attr_list.run, true); run_close(&ni->attr_list.run); ni->attr_list.size = 0; - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.dirty = false; @@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) return 0; out: - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; return err; @@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page) gfp_t gfp_mask; struct page *pg; - if (vbo >= ni->vfs_inode.i_size) { + if (vbo >= i_size_read(&ni->vfs_inode)) { SetPageUptodate(page); err = 0; goto out; @@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct inode *inode = &ni->vfs_inode; - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); struct page **pages = NULL; @@ -2457,6 +2457,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, struct ATTR_LIST_ENTRY *le = NULL; struct runs_tree *run = &ni->file.run; u64 valid_size = ni->i_valid; + loff_t i_size = i_size_read(&ni->vfs_inode); u64 vbo_disk; size_t unc_size; u32 frame_size, i, npages_disk, ondisk_size; @@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, } } - frames = (ni->vfs_inode.i_size - 1) >> frame_bits; + frames = (i_size - 1) >> frame_bits; err = attr_wof_frame_info(ni, attr, run, frame64, frames, frame_bits, &ondisk_size, &vbo_data); @@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, goto out2; if (frame64 == frames) { - unc_size = 1 + ((ni->vfs_inode.i_size - 1) & - (frame_size - 1)); + unc_size = 1 + ((i_size - 1) & (frame_size - 1)); ondisk_size = attr_size(attr) - vbo_data; } else { unc_size = frame_size; @@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_bad_inode(inode) || sb_rdonly(sb)) return 0; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (!ni_trylock(ni)) { /* 'ni' is under modification, skip for now. 
*/ mark_inode_dirty_sync(inode); return 0; } @@ -3288,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } - ts = inode_get_mtime(inode); + ts = inode_get_ctime(inode); dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 98ccb6650858..855519713bf7 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) { const struct RESTART_AREA *ra; u16 cl, fl, ul; - u32 off, l_size, file_dat_bits, file_size_round; + u32 off, l_size, seq_bits; u16 ro = le16_to_cpu(rhdr->ra_off); u32 sys_page = le32_to_cpu(rhdr->sys_page_size); @@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) /* Make sure the sequence number bits match the log file size. */ l_size = le64_to_cpu(ra->l_size); - file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits); - file_size_round = 1u << (file_dat_bits + 3); - if (file_size_round != l_size && - (file_size_round < l_size || (file_size_round / 2) > l_size)) { - return false; + seq_bits = sizeof(u64) * 8 + 3; + while (l_size) { + l_size >>= 1; + seq_bits -= 1; } + if (seq_bits != ra->seq_num_bits) + return false; + /* The log page data offset and record header length must be quad-aligned. */ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) @@ -974,6 +976,16 @@ skip_looking: return e; } +struct restart_info { + u64 last_lsn; + struct RESTART_HDR *r_page; + u32 vbo; + bool chkdsk_was_run; + bool valid_page; + bool initialized; + bool restart; +}; + #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) #define NTFSLOG_WRAPPED 0x00000001 @@ -987,6 +999,7 @@ struct ntfs_log { struct ntfs_inode *ni; u32 l_size; + u32 orig_file_size; u32 sys_page_size; u32 sys_page_mask; u32 page_size; @@ -1040,6 +1053,8 @@ struct ntfs_log { struct CLIENT_ID client_id; u32 client_undo_commit; + + struct restart_info rst_info, rst_info2; }; static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) @@ -1105,16 +1120,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log, lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; } -struct restart_info { - u64 last_lsn; - struct RESTART_HDR *r_page; - u32 vbo; - bool chkdsk_was_run; - bool valid_page; - bool initialized; - bool restart; -}; - static int read_log_page(struct ntfs_log *log, u32 vbo, struct RECORD_PAGE_HDR **buffer, bool *usa_error) { @@ -1176,7 +1181,7 @@ out: * restart page header. It will stop the first time we find a * valid page header. */ -static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, +static int log_read_rst(struct ntfs_log *log, bool first, struct restart_info *info) { u32 skip, vbo; @@ -1192,7 +1197,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, } /* Loop continuously until we succeed. */ - for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) { + for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) { bool usa_error; bool brst, bchk; struct RESTART_AREA *ra; @@ -1285,22 +1290,17 @@ check_result: /* * log_init_pg_hdr - Init @log from restart page header. 
*/ -static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, - u32 page_size, u16 major_ver, u16 minor_ver) +static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver) { - log->sys_page_size = sys_page_size; - log->sys_page_mask = sys_page_size - 1; - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->sys_page_size = log->page_size; + log->sys_page_mask = log->page_mask; log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; if (!log->clst_per_page) log->clst_per_page = 1; - log->first_page = major_ver >= 2 ? - 0x22 * page_size : - ((sys_page_size << 1) + (page_size << 1)); + log->first_page = major_ver >= 2 ? 0x22 * log->page_size : + 4 * log->page_size; log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1308,12 +1308,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, /* * log_create - Init @log in cases when we don't have a restart area to use. */ -static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn, +static void log_create(struct ntfs_log *log, const u64 last_lsn, u32 open_log_count, bool wrapped, bool use_multi_page) { - log->l_size = l_size; /* All file offsets must be quadword aligned. */ - log->file_data_bits = blksize_bits(l_size) - 3; + log->file_data_bits = blksize_bits(log->l_size) - 3; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; log->seq_num = (last_lsn >> log->file_data_bits) + 2; @@ -3720,10 +3719,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct ntfs_sb_info *sbi = ni->mi.sbi; struct ntfs_log *log; - struct restart_info rst_info, rst_info2; - u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0; + u64 rec_lsn, checkpt_lsn = 0, rlsn = 0; struct ATTR_NAME_ENTRY *attr_names = NULL; - struct ATTR_NAME_ENTRY *ane; struct RESTART_TABLE *dptbl = NULL; struct RESTART_TABLE *trtbl = NULL; const struct RESTART_TABLE *rt; @@ -3741,9 +3738,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct TRANSACTION_ENTRY *tr; struct DIR_PAGE_ENTRY *dp; u32 i, bytes_per_attr_entry; - u32 l_size = ni->vfs_inode.i_size; - u32 orig_file_size = l_size; - u32 page_size, vbo, tail, off, dlen; + u32 vbo, tail, off, dlen; u32 saved_len, rec_len, transact_id; bool use_second_page; struct RESTART_AREA *ra2, *ra = NULL; @@ -3758,52 +3753,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) u16 t16; u32 t32; - /* Get the size of page. NOTE: To replay we can use default page. */ -#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 - page_size = norm_file_page(PAGE_SIZE, &l_size, true); -#else - page_size = norm_file_page(PAGE_SIZE, &l_size, false); -#endif - if (!page_size) - return -EINVAL; - log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); if (!log) return -ENOMEM; log->ni = ni; - log->l_size = l_size; - log->one_page_buf = kmalloc(page_size, GFP_NOFS); + log->l_size = log->orig_file_size = ni->vfs_inode.i_size; + /* Get the size of page. NOTE: To replay we can use default page. 
*/ +#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true); +#else + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false); +#endif + if (!log->page_size) { + err = -EINVAL; + goto out; + } + + log->one_page_buf = kmalloc(log->page_size, GFP_NOFS); if (!log->one_page_buf) { err = -ENOMEM; goto out; } - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->page_mask = log->page_size - 1; + log->page_bits = blksize_bits(log->page_size); /* Look for a restart area on the disk. */ - memset(&rst_info, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, true, &rst_info); + err = log_read_rst(log, true, &log->rst_info); if (err) goto out; /* remember 'initialized' */ - *initialized = rst_info.initialized; + *initialized = log->rst_info.initialized; - if (!rst_info.restart) { - if (rst_info.initialized) { + if (!log->rst_info.restart) { + if (log->rst_info.initialized) { /* No restart area but the file is not initialized. */ err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_u32(), false, false); - - log->ra = ra; + log_init_pg_hdr(log, 1, 1); + log_create(log, 0, get_random_u32(), false, false); ra = log_create_ra(log); if (!ra) { @@ -3820,25 +3813,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) * If the restart offset above wasn't zero then we won't * look for a second restart. */ - if (rst_info.vbo) + if (log->rst_info.vbo) goto check_restart_area; - memset(&rst_info2, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, false, &rst_info2); + err = log_read_rst(log, false, &log->rst_info2); if (err) goto out; /* Determine which restart area to use. */ - if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) + if (!log->rst_info2.restart || + log->rst_info2.last_lsn <= log->rst_info.last_lsn) goto use_first_page; use_second_page = true; - if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) { + if (log->rst_info.chkdsk_was_run && + log->page_size != log->rst_info.vbo) { struct RECORD_PAGE_HDR *sp = NULL; bool usa_error; - if (!read_log_page(log, page_size, &sp, &usa_error) && + if (!read_log_page(log, log->page_size, &sp, &usa_error) && sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { use_second_page = false; } @@ -3846,52 +3840,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } if (use_second_page) { - kfree(rst_info.r_page); - memcpy(&rst_info, &rst_info2, sizeof(struct restart_info)); - rst_info2.r_page = NULL; + kfree(log->rst_info.r_page); + memcpy(&log->rst_info, &log->rst_info2, + sizeof(struct restart_info)); + log->rst_info2.r_page = NULL; } use_first_page: - kfree(rst_info2.r_page); + kfree(log->rst_info2.r_page); check_restart_area: /* * If the restart area is at offset 0, we want * to write the second restart area first. */ - log->init_ra = !!rst_info.vbo; + log->init_ra = !!log->rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ - ra2 = rst_info.valid_page ? - Add2Ptr(rst_info.r_page, - le16_to_cpu(rst_info.r_page->ra_off)) : + ra2 = log->rst_info.valid_page ? 
+ Add2Ptr(log->rst_info.r_page, + le16_to_cpu(log->rst_info.r_page->ra_off)) : NULL; - if (rst_info.chkdsk_was_run || + if (log->rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { bool wrapped = false; bool use_multi_page = false; u32 open_log_count; /* Do some checks based on whether we have a valid log page. */ - if (!rst_info.valid_page) { - open_log_count = get_random_u32(); - goto init_log_instance; - } - open_log_count = le32_to_cpu(ra2->open_log_count); - - /* - * If the restart page size isn't changing then we want to - * check how much work we need to do. - */ - if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size)) - goto init_log_instance; + open_log_count = log->rst_info.valid_page ? + le32_to_cpu(ra2->open_log_count) : + get_random_u32(); -init_log_instance: - log_init_pg_hdr(log, page_size, page_size, 1, 1); + log_init_pg_hdr(log, 1, 1); - log_create(log, l_size, rst_info.last_lsn, open_log_count, - wrapped, use_multi_page); + log_create(log, log->rst_info.last_lsn, open_log_count, wrapped, + use_multi_page); ra = log_create_ra(log); if (!ra) { @@ -3916,28 +3901,27 @@ init_log_instance: * use the log file. We must use the system page size instead of the * default size if there is not a clean shutdown. */ - t32 = le32_to_cpu(rst_info.r_page->sys_page_size); - if (page_size != t32) { - l_size = orig_file_size; - page_size = - norm_file_page(t32, &l_size, t32 == DefaultLogPageSize); + t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size); + if (log->page_size != t32) { + log->l_size = log->orig_file_size; + log->page_size = norm_file_page(t32, &log->l_size, + t32 == DefaultLogPageSize); } - if (page_size != t32 || - page_size != le32_to_cpu(rst_info.r_page->page_size)) { + if (log->page_size != t32 || + log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) { err = -EINVAL; goto out; } /* If the file size has shrunk then we won't mount it. */ - if (l_size < le64_to_cpu(ra2->l_size)) { + if (log->l_size < le64_to_cpu(ra2->l_size)) { err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, - le16_to_cpu(rst_info.r_page->major_ver), - le16_to_cpu(rst_info.r_page->minor_ver)); + log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver), + le16_to_cpu(log->rst_info.r_page->minor_ver)); log->l_size = le64_to_cpu(ra2->l_size); log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); @@ -3945,7 +3929,7 @@ init_log_instance: log->seq_num_mask = (8 << log->file_data_bits) - 1; log->last_lsn = le64_to_cpu(ra2->current_lsn); log->seq_num = log->last_lsn >> log->file_data_bits; - log->ra_off = le16_to_cpu(rst_info.r_page->ra_off); + log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off); log->restart_size = log->sys_page_size - log->ra_off; log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); log->ra_size = le16_to_cpu(ra2->ra_len); @@ -4045,7 +4029,7 @@ find_oldest: log->current_avail = current_log_avail(log); /* Remember which restart area to write first. */ - log->init_ra = rst_info.vbo; + log->init_ra = log->rst_info.vbo; process_log: /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ @@ -4105,7 +4089,7 @@ process_log: log->client_id.seq_num = cr->seq_num; log->client_id.client_idx = client; - err = read_rst_area(log, &rst, &ra_lsn); + err = read_rst_area(log, &rst, &checkpt_lsn); if (err) goto out; @@ -4114,9 +4098,8 @@ process_log: bytes_per_attr_entry = !rst->major_ver ? 
0x2C : 0x28; - checkpt_lsn = le64_to_cpu(rst->check_point_start); - if (!checkpt_lsn) - checkpt_lsn = ra_lsn; + if (rst->check_point_start) + checkpt_lsn = le64_to_cpu(rst->check_point_start); /* Allocate and Read the Transaction Table. */ if (!rst->transact_table_len) @@ -4330,23 +4313,20 @@ check_attr_table: lcb = NULL; check_attribute_names2: - if (!rst->attr_names_len) - goto trace_attribute_table; - - ane = attr_names; - if (!oatbl) - goto trace_attribute_table; - while (ane->off) { - /* TODO: Clear table on exit! */ - oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); - t16 = le16_to_cpu(ane->name_bytes); - oe->name_len = t16 / sizeof(short); - oe->ptr = ane->name; - oe->is_attr_name = 2; - ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16); - } - -trace_attribute_table: + if (rst->attr_names_len && oatbl) { + struct ATTR_NAME_ENTRY *ane = attr_names; + while (ane->off) { + /* TODO: Clear table on exit! */ + oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); + t16 = le16_to_cpu(ane->name_bytes); + oe->name_len = t16 / sizeof(short); + oe->ptr = ane->name; + oe->is_attr_name = 2; + ane = Add2Ptr(ane, + sizeof(struct ATTR_NAME_ENTRY) + t16); + } + } + /* * If the checkpt_lsn is zero, then this is a freshly * formatted disk and we have no work to do. @@ -5189,7 +5169,7 @@ out: kfree(oatbl); kfree(dptbl); kfree(attr_names); - kfree(rst_info.r_page); + kfree(log->rst_info.r_page); kfree(ra); kfree(log->one_page_buf); diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index fbfe21dbb425..ae2ef5c11868 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) /* * sb can be NULL here. In this case sbi->flags should be 0 too. */ - if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) + if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) || + unlikely(ntfs3_forced_shutdown(sb))) return; blocksize = sb->s_blocksize; @@ -1006,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes) return cpu_to_le32(hash); } +/* + * simple wrapper for sb_bread_unmovable. + */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct buffer_head *bh; + + if (unlikely(block >= sbi->volume.blocks)) { + /* prevent generic message "attempt to access beyond end of device" */ + ntfs_err(sb, "try to read out of volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; + } + + bh = sb_bread_unmovable(sb, block); + if (bh) + return bh; + + ntfs_err(sb, "failed to read volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; +} + int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) { struct block_device *bdev = sb->s_bdev; @@ -2128,8 +2153,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, if (le32_to_cpu(d_security->size) == new_sec_size && d_security->key.hash == hash_key.hash && !memcmp(d_security + 1, sd, size_sd)) { - *security_id = d_security->key.sec_id; /* Such security already exists. 
*/ + *security_id = d_security->key.sec_id; err = 0; goto out; } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index cf92b2433f7a..daabaad63aaf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, goto out2; if (in->name == I30_NAME) { - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, alloc_size); } @@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, } if (in->name == I30_NAME) - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); *vbn = bit << indx->idx2vbn_bits; @@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, return err; if (in->name == I30_NAME) - ni->vfs_inode.i_size = new_data; + i_size_write(&ni->vfs_inode, new_data); bpb = bitmap_size(bit); if (bpb * 8 == nbits) @@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); if (in->name == I30_NAME) - ni->vfs_inode.i_size = 0; + i_size_write(&ni->vfs_inode, 0); err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 5e3d71374918..eb7a8c9fba01 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -345,9 +345,7 @@ next_attr: inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer .PrintNameLength) / sizeof(u16); - ni->i_valid = inode->i_size; - /* Clear directory bit. */ if (ni->ni_flags & NI_FLAG_DIR) { indx_clear(&ni->dir); @@ -412,7 +410,6 @@ end_enum: goto out; if (!is_match && name) { - /* Reuse rec as buffer for ascii name. */ err = -ENOENT; goto out; } @@ -427,6 +424,7 @@ end_enum: if (names != le16_to_cpu(rec->hard_links)) { /* Correct minor error on the fly. Do not mark inode as dirty. 
*/ + ntfs_inode_warn(inode, "Correct links count -> %u.", names); rec->hard_links = cpu_to_le16(names); ni->mi.dirty = true; } @@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, off = vbo & (PAGE_SIZE - 1); folio_set_bh(bh, folio, off); - err = bh_read(bh, 0); - if (err < 0) + if (bh_read(bh, 0) < 0) { + err = -EIO; goto out; + } folio_zero_segment(folio, off + voff, off + block_size); } } @@ -853,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - struct ntfs_inode *ni = ntfs_i(mapping->host); + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); int ret; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + ni_lock(ni); ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); @@ -869,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio, static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - if (is_resident(ntfs_i(mapping->host))) + struct inode *inode = mapping->host; + + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + + if (is_resident(ntfs_i(inode))) return write_cache_pages(mapping, wbc, ntfs_resident_writepage, mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); @@ -889,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + *pagep = NULL; if (is_resident(ni)) { struct page *page = @@ -974,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, } if (pos + err > inode->i_size) { - inode->i_size = pos + err; + i_size_write(inode, pos + err); dirty = true; } @@ -1306,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, goto out1; } + if (unlikely(ntfs3_forced_shutdown(sb))) { + err = -EIO; + goto out2; + } + /* Mark rw ntfs as dirty. It will be cleared at umount. 
*/ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be5170..cae41db0aaa7 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, u32 size = strlen(symname); struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); @@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, 1024); static_assert(PATH_MAX >= 4 * 1024); + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (flags & ~RENAME_NOREPLACE) return -EINVAL; diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 86aecbb01a92..9c7478150a03 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -523,12 +523,10 @@ struct ATTR_LIST_ENTRY { __le64 vcn; // 0x08: Starting VCN of this attribute. struct MFT_REF ref; // 0x10: MFT record number with attribute. __le16 id; // 0x18: struct ATTRIB ID. - __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset. + __le16 name[]; // 0x1A: To get real name use name_off. }; // sizeof(0x20) -static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20); - static inline u32 le_size(u8 name_len) { return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index f6706143d14b..79356fd29a14 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -61,6 +61,8 @@ enum utf16_endian; /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 +/* ntfs in shutdown state. */ +#define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/ /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ @@ -226,7 +228,7 @@ struct ntfs_sb_info { u64 maxbytes; // Maximum size for normal files. u64 maxbytes_sparse; // Maximum size for sparse file. - u32 flags; // See NTFS_FLAGS_XXX. + unsigned long flags; // See NTFS_FLAGS_ CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. 
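A minimal sketch of how the shutdown flag above is meant to be used, based only on what this patch itself introduces (NTFS_FLAGS_SHUTDOWN_BIT here, set_bit() in ntfs_shutdown() and test_bit() in ntfs3_forced_shutdown() elsewhere in the patch); the example_* wrapper names below are illustrative, not part of the patch. The change of sbi->flags from u32 to unsigned long is what makes this pattern legal: the atomic bitops take a bit number and a pointer to unsigned long, so NTFS_FLAGS_SHUTDOWN_BIT is used as bit number 2 (mask value 4, hence the "== 4" remark above).

#include <linux/bitops.h>	/* set_bit(), test_bit() */

/* Illustrative only: mirrors the pattern added by this patch. */
static void example_mark_shutdown(struct ntfs_sb_info *sbi)
{
	/* Atomic read-modify-write on the unsigned long flags word. */
	set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &sbi->flags);
}

static bool example_is_shutdown(struct ntfs_sb_info *sbi)
{
	/* Bit number 2 corresponds to mask 0x4 in sbi->flags. */
	return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &sbi->flags);
}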
@@ -473,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { - return (size + 1023) & ~(size_t)1023; + return size_add(size, 1023) & ~(size_t)1023; } /* Globals from bitfunc.c */ @@ -500,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg); +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg); extern const struct inode_operations ntfs_special_inode_operations; extern const struct inode_operations ntfs_file_inode_operations; extern const struct file_operations ntfs_file_operations; @@ -584,6 +588,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block); bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); @@ -872,7 +877,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern const struct xattr_handler * const ntfs_xattr_handlers[]; +extern const struct xattr_handler *const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); @@ -999,6 +1004,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) return sb->s_fs_info; } +static inline int ntfs3_forced_shutdown(struct super_block *sb) +{ + return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); +} + /* * ntfs_up_cluster - Align up on cluster boundary. */ @@ -1025,19 +1035,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size) return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } -static inline struct buffer_head *ntfs_bread(struct super_block *sb, - sector_t block) -{ - struct buffer_head *bh = sb_bread(sb, block); - - if (bh) - return bh; - - ntfs_err(sb, "failed to read volume at offset 0x%llx", - (u64)block << sb->s_blocksize_bits); - return NULL; -} - static inline struct ntfs_inode *ntfs_i(struct inode *inode) { return container_of(inode, struct ntfs_inode, vfs_inode); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 53629b1f65e9..6aa3a9d44df1 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t16 > asize) return NULL; - if (t16 + le32_to_cpu(attr->res.data_size) > asize) + if (le32_to_cpu(attr->res.data_size) > asize - t16) return NULL; t32 = sizeof(short) * attr->name_len; @@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, return false; if (ni && is_attr_indexed(attr)) { - le16_add_cpu(&ni->mi.mrec->hard_links, -1); - ni->mi.dirty = true; + u16 links = le16_to_cpu(ni->mi.mrec->hard_links); + struct ATTR_FILE_NAME *fname = + attr->type != ATTR_NAME ? + NULL : + resident_data_ex(attr, + SIZEOF_ATTRIBUTE_FILENAME); + if (fname && fname->type == FILE_NAME_DOS) { + /* Do not decrease links count deleting DOS name. 
*/ + } else if (!links) { + /* minor error. Not critical. */ + } else { + ni->mi.mrec->hard_links = cpu_to_le16(links - 1); + ni->mi.dirty = true; + } } used -= asize; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 9153dffde950..cef5467fd928 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) if (name) { struct dentry *de = d_find_alias(inode); - const u32 name_len = ARRAY_SIZE(s_name_buf) - 1; if (de) { spin_lock(&de->d_lock); - snprintf(name, name_len, " \"%s\"", de->d_name.name); + snprintf(name, sizeof(s_name_buf), " \"%s\"", + de->d_name.name); spin_unlock(&de->d_lock); - name[name_len] = 0; /* To be sure. */ } else { name[0] = 0; } @@ -625,7 +624,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); - kfree(sbi->def_table); + kvfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); @@ -715,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) } /* + * ntfs_shutdown - super_operations::shutdown + */ +static void ntfs_shutdown(struct super_block *sb) +{ + set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); +} + +/* * ntfs_sync_fs - super_operations::sync_fs */ static int ntfs_sync_fs(struct super_block *sb, int wait) @@ -724,6 +731,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait) struct ntfs_inode *ni; struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + ni = sbi->security.ni; if (ni) { inode = &ni->vfs_inode; @@ -763,6 +773,7 @@ static const struct super_operations ntfs_sops = { .put_super = ntfs_put_super, .statfs = ntfs_statfs, .show_options = ntfs_show_options, + .shutdown = ntfs_shutdown, .sync_fs = ntfs_sync_fs, .write_inode = ntfs3_write_inode, }; @@ -866,6 +877,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u16 fn, ao; u8 cluster_bits; u32 boot_off = 0; + sector_t boot_block = 0; const char *hint = "Primary boot"; /* Save original dev_size. Used with alternative boot. */ @@ -873,11 +885,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->volume.blocks = dev_size >> PAGE_SHIFT; - bh = ntfs_bread(sb, 0); +read_boot: + bh = ntfs_bread(sb, boot_block); if (!bh) - return -EIO; + return boot_block ? -EINVAL : -EIO; -check_boot: err = -EINVAL; /* Corrupted image; do not read OOB */ @@ -1108,26 +1120,24 @@ check_boot: } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { + brelse(bh); + + if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); u64 lbo = dev_size0 - sizeof(*boot); - /* - * Try alternative boot (last sector) - */ - brelse(bh); - - sb_set_blocksize(sb, block_size); - bh = ntfs_bread(sb, lbo >> blksize_bits(block_size)); - if (!bh) - return -EINVAL; - + boot_block = lbo >> blksize_bits(block_size); boot_off = lbo & (block_size - 1); - hint = "Alternative boot"; - dev_size = dev_size0; /* restore original size. */ - goto check_boot; + if (boot_block && block_size >= boot_off + sizeof(*boot)) { + /* + * Try alternative boot (last sector) + */ + sb_set_blocksize(sb, block_size); + hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. 
*/ + goto read_boot; + } } - brelse(bh); return err; } diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 4274b6f31cfa..53e7d1fa036a 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, if (!ea->name_len) break; + if (ea->name_len > ea_size) + break; + if (buffer) { /* Check if we can use field ea->name */ if (off + ea_size > size) @@ -744,6 +747,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + /* Dispatch request. */ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { /* system.dos_attrib */ diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 984ffdaeed6c..5764f91d283e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -18,10 +18,11 @@ struct ovl_lookup_data { struct super_block *sb; - struct vfsmount *mnt; + const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; + bool xwhiteouts; bool stop; bool last; char *redirect; @@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) -{ - return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); -} - static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, + base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { + struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; + bool is_upper = d->layer->idx == 0; + char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { @@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } path.dentry = this; - path.mnt = d->mnt; - if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { + path.mnt = d->layer->mnt; + if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); + err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; @@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { + /* overlay.opaque=x means xwhiteouts directory */ + val = ovl_get_opaquedir_val(ofs, &path); + if (last_element && !is_upper && val == 'x') { + d->xwhiteouts = true; + ovl_layer_set_xwhiteouts(ofs, d->layer); + } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; @@ -863,7 +868,8 @@ fail: * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. 
*/ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); @@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); - if (path->dentry) + if (path->dentry) { + *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; + } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; - path->mnt = lowerstack[idx - 1].layer->mnt; + *layer = lowerstack[idx - 1].layer; + path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } @@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.mnt = ovl_upper_mnt(ofs); + d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); - d.mnt = lower.layer->mnt; + d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperopaque) ovl_dentry_set_opaque(dentry); + if (d.xwhiteouts) + ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5ba11eb43767..ee949f3e7c77 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -50,7 +50,6 @@ enum ovl_xattr { OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, - OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -70,6 +69,8 @@ enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, + /* Lower stack may contain xwhiteout entries */ + OVL_E_XWHITEOUTS, }; enum { @@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); +void ovl_dentry_set_xwhiteouts(struct dentry *dentry); +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); @@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox); +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const 
struct path *upperpath); @@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb, .mnt = ovl_upper_mnt(ofs), }; - return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); + return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y'; +} + +static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs, + const struct path *path) +{ + return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE); } static inline bool ovl_redirect_follow(struct ovl_fs *ofs) @@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer); int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 5fa9c58af65f..cb449ab310a7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -40,6 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; + /* xwhiteouts were found on this layer */ + bool has_xwhiteouts; }; struct ovl_path { @@ -59,7 +61,7 @@ struct ovl_fs { unsigned int numfs; /* Number of data-only lower layers */ unsigned int numdatalayer; - const struct ovl_layer *layers; + struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e71156baa7bc..0ca8af060b0c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->in_xwhiteouts_dir = rdd->dentry && - ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .is_lowest = false, }; int idx, next; + const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { - next = ovl_path_next(idx, dentry, &realpath); + next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && + ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4ab66e3d4cff..2eef6c70b2ae 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1249,6 +1249,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, struct ovl_entry *oe) { struct dentry *root; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; @@ -1270,6 +1271,20 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_set_flag(OVL_IMPURE, d_inode(root)); } + /* Look for xwhiteouts marker except in the lowermost layer */ + for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) { + struct path path = { + .mnt = lowerpath->layer->mnt, + .dentry = lowerpath->dentry, + }; + + /* overlay.opaque=x means xwhiteouts 
directory */ + if (ovl_get_opaquedir_val(ofs, &path) == 'x') { + ovl_layer_set_xwhiteouts(ofs, lowerpath->layer); + ovl_dentry_set_xwhiteouts(root); + } + } + /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); ovl_dentry_set_flag(OVL_E_CONNECTED, root); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0217094c23ea..a8e17f14d7a2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry) ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry); +} + +void ovl_dentry_set_xwhiteouts(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry); +} + +/* + * ovl_layer_set_xwhiteouts() is called before adding the overlay dir + * dentry to dcache, while readdir of that same directory happens after + * the overlay dir dentry is in dcache, so if some cpu observes that + * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts + * for the layers where xwhiteouts marker was found in that merge dir. + */ +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer) +{ + if (layer->has_xwhiteouts) + return; + + /* Write once to read-mostly layer properties */ + ofs->layers[layer->idx].has_xwhiteouts = true; +} + /* * For hard links and decoded file handles, it's possible for ovl_dentry_upper() * to return positive, while there's no actual upper alias for the inode. @@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) return res >= 0; } -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) -{ - struct dentry *dentry = path->dentry; - int res; - - /* xattr.whiteouts must be a directory */ - if (!d_is_dir(dentry)) - return false; - - res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); - return res >= 0; -} - /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. @@ -811,20 +825,17 @@ fail: return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox) +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox) { int res; char val; if (!d_is_dir(path->dentry)) - return false; + return 0; res = ovl_path_getxattr(ofs, path, ox, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; + return res == 1 ? 
val : 0; } #define OVL_XATTR_OPAQUE_POSTFIX "opaque" @@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" -#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), - OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, diff --git a/fs/proc/array.c b/fs/proc/array.c index ff08a8957552..34a47fb0c57f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, int permitted; struct mm_struct *mm; unsigned long long start_time; - unsigned long cmin_flt = 0, cmaj_flt = 0; - unsigned long min_flt = 0, maj_flt = 0; - u64 cutime, cstime, utime, stime; - u64 cgtime, gtime; + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; + u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; + struct signal_struct *sig = task->signal; + unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; - cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); @@ -527,28 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); - cmin_flt = sig->cmin_flt; - cmaj_flt = sig->cmaj_flt; - cutime = sig->cutime; - cstime = sig->cstime; - cgtime = sig->cgtime; rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); - /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t; - - __for_each_thread(sig, t) { - min_flt += t->min_flt; - maj_flt += t->maj_flt; - gtime += task_gtime(t); - } - - min_flt += sig->min_flt; - maj_flt += sig->maj_flt; - thread_group_cputime_adjusted(task, &utime, &stime); - gtime += sig->gtime; - if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } @@ -562,10 +539,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - if (!whole) { + + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + if (whole) { + struct task_struct *t; + + min_flt = sig->min_flt; + maj_flt = sig->maj_flt; + gtime = sig->gtime; + + rcu_read_lock(); + __for_each_thread(sig, t) { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } + rcu_read_unlock(); + } + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + + if (whole) { + thread_group_cputime_adjusted(task, &utime, &stime); + } else { 
+ task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; - task_cputime_adjusted(task, &utime, &stime); gtime = task_gtime(task); } diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 971892620504..1daeb5714faa 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fid *cfid; struct cached_fids *cfids; const char *npath; + int retries = 0, cur_sleep = 1; if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) return -EOPNOTSUPP; ses = tcon->ses; - server = cifs_pick_channel(ses); cfids = tcon->cfids; - if (!server->ops->new_lease_key) - return -EIO; - if (cifs_sb->root == NULL) return -ENOENT; +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_II; + server = cifs_pick_channel(ses); + + if (!server->ops->new_lease_key) + return -EIO; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -268,6 +274,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, */ cfid->has_lease = true; + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); @@ -367,6 +378,11 @@ out: atomic_inc(&tcon->num_remote_opens); } kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index ef4c2e3c9fa6..6322f0f68a17 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); UniStrupr(user); } else { - memset(user, '\0', 2); + *(u16 *)user = 0; } rc = crypto_shash_update(ses->server->secmech.hmacmd5, diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index e902de4e475a..2a4a4e3a8751 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->server_eof = 0; + cifs_inode->netfs.remote_i_size = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -1380,6 +1380,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode); + struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode); struct cifsFileInfo *smb_file_src; struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; @@ -1428,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->server_eof < off + len) { + if (src_cifsi->netfs.remote_i_size < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1452,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, /* Discard all the folios that overlap the destination region. 
*/ truncate_inode_pages_range(&target_inode->i_data, fstart, fend); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, + i_size_read(target_inode), 0); + rc = file_modified(dst_file); if (!rc) { rc = target_tcon->ses->server->ops->copychunk_range(xid, smb_file_src, smb_file_target, off, len, destoff); - if (rc > 0 && destoff + rc > i_size_read(target_inode)) + if (rc > 0 && destoff + rc > i_size_read(target_inode)) { truncate_setsize(target_inode, destoff + rc); + netfs_resize_file(&target_cifsi->netfs, + i_size_read(target_inode), true); + fscache_resize_cookie(cifs_inode_cookie(target_inode), + i_size_read(target_inode)); + } + if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) + target_cifsi->netfs.zero_point = destoff + rc; } file_accessed(src_file); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 20036fb16cec..c86a72c9d9ec 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -50,6 +50,11 @@ #define CIFS_DEF_ACTIMEO (1 * HZ) /* + * max sleep time before retry to server + */ +#define CIFS_MAX_SLEEP 2000 + +/* * max attribute cache timeout (jiffies) - 2^30 */ #define CIFS_MAX_ACTIMEO (1 << 30) @@ -82,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 5 +#define MAX_COMPOUND 7 /* * Default number of credits to keep available for SMB3. @@ -1027,6 +1032,8 @@ struct cifs_chan { __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; +#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1) + /* * Session structure. One of these for each uid session with a particular host */ @@ -1059,6 +1066,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? */ bool domainAuto:1; + unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; @@ -1501,6 +1509,7 @@ struct cifs_writedata { struct smbd_mr *mr; #endif struct cifs_credits credits; + bool replay; }; /* @@ -1561,7 +1570,6 @@ struct cifsInodeInfo { spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ - u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ @@ -1831,6 +1839,13 @@ static inline bool is_retryable_error(int error) return false; } +static inline bool is_replayable_error(int error) +{ + if (error == -EAGAIN || error == -ECONNABORTED) + return true; + return false; +} + /* cifs_get_writable_file() flags */ #define FIND_WR_ANY 0 diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index bfd568f89710..d03253f8f145 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { /* check if iface is still active */ spin_lock(&ses->chan_lock); + if (cifs_ses_get_chan_index(ses, server) == + CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + continue; + } + if (!cifs_chan_is_iface_active(ses, server)) { spin_unlock(&ses->chan_lock); cifs_chan_update_iface(ses, server); @@ -4228,6 +4234,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing 
reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index a8a1d386da65..449c59830039 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; @@ -625,8 +630,8 @@ out: spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_GOOD; - spin_unlock(&tcon->tc_lock); tcon->need_reconnect = false; + spin_unlock(&tcon->tc_lock); } return rc; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 3a213432775b..f391c9b803d8 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le xas_for_each(&xas, folio, end) { if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->need_reconnect) + tcon->status = TID_NEED_RECON; + if (tcon->status != TID_NEED_RECON) { spin_unlock(&tcon->tc_lock); return; @@ -2120,8 +2123,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, { loff_t end_of_write = offset + bytes_written; - if (end_of_write > cifsi->server_eof) - cifsi->server_eof = end_of_write; + if (end_of_write > cifsi->netfs.remote_i_size) + netfs_resize_file(&cifsi->netfs, end_of_write, true); } static ssize_t @@ -2651,7 +2654,7 @@ static void cifs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) + if (folio->index != index) break; if (!folio_try_get_rcu(folio)) { xas_reset(&xas); @@ -2899,7 +2902,7 @@ redo_folio: goto skip_write; } - if (folio_mapping(folio) != mapping || + if (folio->mapping != mapping || !folio_test_dirty(folio)) { start += folio_size(folio); folio_unlock(folio); @@ -3247,8 +3250,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_lock(&inode->i_lock); cifs_update_eof(cifsi, wdata->offset, wdata->bytes); - if (cifsi->server_eof > inode->i_size) - i_size_write(inode, cifsi->server_eof); + if (cifsi->netfs.remote_i_size > inode->i_size) + i_size_write(inode, cifsi->netfs.remote_i_size); spin_unlock(&inode->i_lock); complete(&wdata->done); @@ -3300,6 +3303,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, if (wdata->cfile->invalidHandle) rc = -EAGAIN; else { + wdata->replay = true; #ifdef CONFIG_CIFS_SMB_DIRECT if 
(wdata->mr) { wdata->mr->need_invalidate = true; diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 52cbef2eeb28..aec8dbd1f9db 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -211,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_errorf(fc, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index f0989484f2c6..d02f8ba29cb5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->server_eof == fattr->cf_eof) { + cifs_i->netfs.remote_i_size == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -194,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->server_eof = fattr->cf_eof; + cifs_i->netfs.remote_i_size = fattr->cf_eof; /* * Can't safely change the file size here if the client is writing to * it due to potential races. @@ -2858,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, set_size_out: if (rc == 0) { - cifsInode->server_eof = attrs->ia_size; + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); cifs_setsize(inode, attrs->ia_size); /* * i_blocks is not related to (i_size / i_blksize), but instead @@ -3011,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } @@ -3210,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 94255401b38d..b520eea7bfce 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -141,7 +141,7 @@ retry: if (likely(reparse_inode_match(inode, fattr))) { fattr->cf_mode = inode->i_mode; fattr->cf_rdev = inode->i_rdev; - fattr->cf_eof = CIFS_I(inode)->server_eof; + fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; fattr->cf_symlink_target = NULL; } else { CIFS_I(inode)->time = 0; @@ -307,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, } static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, - SEARCH_ID_FULL_DIR_INFO *info, + const void *info, struct cifs_sb_info *cifs_sb) { + const FILE_FULL_DIRECTORY_INFO *di = info; + __dir_info_to_fattr(fattr, info); - /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */ + /* See MS-FSCC 2.4.14, 2.4.19 */ if (fattr->cf_cifsattrs & ATTR_REPARSE) - fattr->cf_cifstag = le32_to_cpu(info->EaSize); + fattr->cf_cifstag = le32_to_cpu(di->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -396,7 +398,7 @@ 
ffirst_retry: } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; } else /* not srvinos - BB fixme add check for backlevel? */ { - cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; } search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; @@ -987,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: case SMB_FIND_FILE_ID_FULL_DIR_INFO: - cifs_fulldir_info_to_fattr(&fattr, - (SEARCH_ID_FULL_DIR_INFO *)find_entry, - cifs_sb); + cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb); break; default: cifs_dir_info_to_fattr(&fattr, diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index cde81042bebd..8f37373fd333 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, { unsigned int i; + /* if the channel is waiting for termination */ + if (server && server->terminate) + return CIFS_INVAL_CHAN_INDEX; + for (i = 0; i < ses->chan_count; i++) { if (ses->chans[i].server == server) return i; @@ -84,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, if (server) cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", server->conn_id); - WARN_ON(1); return CIFS_INVAL_CHAN_INDEX; } @@ -269,6 +272,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses) &iface->sockaddr, rc); kref_put(&iface->refcount, release_iface); + /* failure to add chan should increase weight */ + iface->weight_fulfilled++; continue; } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index a652200540c8..05818cd6d932 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -120,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, unsigned int size[2]; void *data[2]; int len; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + num_rqst = 0; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -127,8 +135,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst = &vars->rqst[0]; rsp_iov = &vars->rsp_iov[0]; - server = cifs_pick_channel(ses); - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -463,15 +469,24 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; if (cfile) { + if (retries) + for (i = 1; i < num_rqst - 2; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst - 2, &rqst[1], &resp_buftype[1], &rsp_iov[1]); - } else + } else { + if (retries) + for (i = 0; i < num_rqst; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst, rqst, resp_buftype, rsp_iov); + } finished: num_rqst = 0; @@ -620,9 +635,6 @@ finished: } SMB2_close_free(&rqst[num_rqst]); - if (cfile) - cifsFileInfo_put(cfile); - num_cmds += 2; if (out_iov && out_buftype) { memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov)); @@ -632,7 +644,16 @@ finished: for (i = 0; i < num_cmds; i++) free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base); } + num_cmds -= 2; /* correct num_cmds as there could be a retry */ kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto 
replay_again; + + if (cfile) + cifsFileInfo_put(cfile); + return rc; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index d9553c2556a2..83c898afc835 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1108,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst *rqst; struct kvec *rsp_iov; __le16 *utf16_path = NULL; @@ -1124,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_file_full_ea_info *ea = NULL; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1244,6 +1251,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, goto sea_exit; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1260,6 +1273,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } #endif @@ -1484,7 +1502,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; char __user *arg = (char __user *)p; struct smb_query_info qi; struct smb_query_info __user *pqi; @@ -1501,6 +1519,13 @@ smb2_ioctl_query_info(const unsigned int xid, void *data[2]; int create_options = is_dir ? 
CREATE_NOT_FILE : CREATE_NOT_DIR; void (*free_req1_func)(struct smb_rqst *r); + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1641,6 +1666,12 @@ smb2_ioctl_query_info(const unsigned int xid, goto free_req_1; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1701,6 +1732,11 @@ free_output_buffer: kfree(buffer); free_vars: kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2227,8 +2263,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int retry_count = 0; + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2278,14 +2320,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); -again: + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, tcon->ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - if (rc == -EAGAIN && retry_count++ < 10) - goto again; - /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { @@ -2333,6 +2376,11 @@ again: SMB2_query_directory_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2458,6 +2506,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid, } void +smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + struct smb2_hdr *shdr; + + if (server->dialect < SMB30_PROT_ID) + return; + + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); + if (shdr == NULL) { + cifs_dbg(FYI, "shdr NULL in smb2_set_replay\n"); + return; + } + shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION; +} + +void smb2_set_related(struct smb_rqst *rqst) { struct smb2_hdr *shdr; @@ -2530,6 +2594,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* + * helper function for exponential backoff and for checking if the request is replayable + */ +bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep) +{ + if (!pretries || !pcur_sleep) + return false; + + if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) { + msleep(*pcur_sleep); + (*pcur_sleep) = ((*pcur_sleep) << 1); + if ((*pcur_sleep) > CIFS_MAX_SLEEP) + (*pcur_sleep) = CIFS_MAX_SLEEP; + return true; + } + + return false; +} + +/* * Passes the query info response back to the caller on success. * Caller need to free this with free_rsp_buf().
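
Together the two helpers above define the retry skeleton that every converted sender in this series follows: the first attempt is unchanged, and each replay re-picks a channel, re-marks the requests with SMB2_FLAGS_REPLAY_OPERATION, and sleeps 1, 2, 4, ... ms between attempts, capped at CIFS_MAX_SLEEP (2000 ms, added to cifsglob.h earlier in the series), for up to server->retrans attempts (or indefinitely when the tcon has the retry flag set). A condensed sketch of the calling convention, with the helper names taken from the patch and the surrounding function illustrative:

	int retries = 0, cur_sleep = 1;

replay_again:
	/* per-attempt state must be rebuilt: channel, flags, oplock */
	server = cifs_pick_channel(ses);

	/* ... build rqst for this attempt ... */

	if (retries)
		smb2_set_replay(server, &rqst);	/* flag the PDU as a replay */

	rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype,
			    flags, &rsp_iov);

	/* ... free per-attempt buffers ... */

	if (is_replayable_error(rc) &&		/* -EAGAIN or -ECONNABORTED */
	    smb2_should_replay(tcon, &retries, &cur_sleep))
		goto replay_again;
	return rc;

Note that the replay flag is only set from SMB 3.0 on; smb2_set_replay() silently does nothing for older dialects, so pre-3.0 retries are plain resends.
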
*/ @@ -2542,7 +2627,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst *rqst; int resp_buftype[3]; @@ -2553,6 +2638,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, int rc; __le16 *utf16_path; struct cached_fid *cfid = NULL; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (!path) path = ""; @@ -2633,6 +2725,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_related(&rqst[2]); + if (retries) { + if (!cfid) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[2]); + } + smb2_set_replay(server, &rqst[1]); + } + if (cfid) { rc = compound_send_recv(xid, ses, server, flags, 1, &rqst[1], @@ -2665,6 +2765,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3213,6 +3318,9 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + netfs_resize_file(&cifsi->netfs, new_size, true); + if (offset < cifsi->netfs.zero_point) + cifsi->netfs.zero_point = offset; fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3436,7 +3544,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc == 0) { - cifsi->server_eof = new_eof; + netfs_resize_file(&cifsi->netfs, new_eof, true); cifs_setsize(inode, new_eof); cifs_truncate_page(inode->i_mapping, inode->i_size); truncate_setsize(inode, new_eof); @@ -3528,8 +3636,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, int rc; unsigned int xid; struct inode *inode = file_inode(file); - struct cifsFileInfo *cfile = file->private_data; struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifsFileInfo *cfile = file->private_data; + struct netfs_inode *ictx = &cifsi->netfs; loff_t old_eof, new_eof; xid = get_xid(); @@ -3549,6 +3658,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); + ictx->zero_point = old_eof; rc = smb2_copychunk_range(xid, cfile, cfile, off + len, old_eof - off - len, off); @@ -3563,9 +3673,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, rc = 0; - cifsi->server_eof = i_size_read(inode) - len; - truncate_setsize(inode, cifsi->server_eof); - fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); + truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, new_eof, true); + ictx->zero_point = new_eof; + fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); out: @@ -3581,6 +3692,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, unsigned int xid; struct cifsFileInfo *cfile = file->private_data; struct inode *inode = file_inode(file); + struct cifsInodeInfo *cifsi = 
CIFS_I(inode); __u64 count, old_eof, new_eof; xid = get_xid(); @@ -3608,6 +3720,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 288199f0b987..4085ce27fd38 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -178,6 +178,7 @@ cifs_chan_skip_or_disable(struct cifs_ses *ses, } ses->chans[chan_index].server = NULL; + server->terminate = true; spin_unlock(&ses->chan_lock); /* @@ -188,14 +189,12 @@ cifs_chan_skip_or_disable(struct cifs_ses *ses, */ cifs_put_tcp_session(server, from_reconnect); - server->terminate = true; cifs_signal_cifsd_for_reconnect(server, false); /* mark primary server as needing reconnect */ pserver = server->primary_server; cifs_signal_cifsd_for_reconnect(pserver, false); skip_terminate: - mutex_unlock(&ses->session_mutex); return -EHOSTDOWN; } @@ -400,6 +399,15 @@ skip_sess_setup: goto out; } + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { + spin_unlock(&ses->ses_lock); + mutex_unlock(&ses->session_mutex); + goto skip_add_channels; + } + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (!rc && (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { mutex_unlock(&ses->session_mutex); @@ -411,7 +419,7 @@ skip_sess_setup: rc = SMB3_request_interfaces(xid, tcon, false); free_xid(xid); - if (rc == -EOPNOTSUPP) { + if (rc == -EOPNOTSUPP && ses->chan_count > 1) { /* * some servers like Azure SMB server do not advertise * that multichannel has been disabled with server @@ -429,17 +437,22 @@ skip_sess_setup: if (ses->chan_max > ses->chan_count && ses->iface_count && !SERVER_IS_CHAN(server)) { - if (ses->chan_count == 1) + if (ses->chan_count == 1) { cifs_server_dbg(VFS, "supports multichannel now\n"); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + } cifs_try_adding_channels(ses); - queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, - (SMB_INTERFACE_POLL_INTERVAL * HZ)); } } else { mutex_unlock(&ses->session_mutex); } + skip_add_channels: + spin_lock(&ses->ses_lock); + ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -2765,7 +2778,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, int flags = 0; unsigned int total_len; __le16 *utf16_path = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + n_iov = 2; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "mkdir\n"); @@ -2869,6 +2889,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); + + if (retries) + smb2_set_replay(server, &rqst); + /* resource #4: response buffer */ rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -2906,6 +2930,11 @@ err_free_req: cifs_small_buf_release(req); err_free_path: kfree(utf16_path); + + if 
(is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3101,12 +3130,18 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct smb2_create_rsp *rsp = NULL; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "create/open\n"); if (!ses || !server) @@ -3128,6 +3163,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path, oparms->create_options, oparms->desired_access); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3181,6 +3219,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3305,15 +3348,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; + int retries = 0, cur_sleep = 1; if (!tcon) return -EIO; @@ -3322,10 +3357,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (!ses) return -EIO; +replay_again: + /* reinitialize for possible replay */ + flags = 0; server = cifs_pick_channel(ses); + if (!server) return -EIO; + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3340,6 +3388,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) goto ioctl_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3409,6 +3460,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, ioctl_exit: SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3480,13 +3536,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct smb_rqst rqst; struct smb2_close_rsp *rsp = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[1]; struct kvec rsp_iov; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; bool query_attrs = false; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + query_attrs = false; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "Close\n"); @@ -3512,6 +3575,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto close_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = 
cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -3545,6 +3611,11 @@ close_exit: cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", persistent_fid, tmp_rc); } + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3675,12 +3746,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; int flags = 0; bool allocated = false; + int retries = 0, cur_sleep = 1; cifs_dbg(FYI, "Query Info\n"); if (!ses) return -EIO; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + allocated = false; server = cifs_pick_channel(ses); + if (!server) return -EIO; @@ -3702,6 +3780,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, ses->Suid, info_class, (__u32)info_type); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -3744,6 +3825,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, qinf_exit: SMB2_query_info_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3844,7 +3930,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst rqst; struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; @@ -3852,6 +3938,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "change notify\n"); if (!ses || !server) @@ -3876,6 +3968,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3910,6 +4006,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, if (rqst.rq_iov) cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */ free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -4152,10 +4253,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "flush\n"); if (!ses || !(ses->server)) @@ -4175,6 +4282,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto flush_exit; trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, 
server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4189,6 +4300,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, flush_exit: SMB2_flush_free(&rqst); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -4668,7 +4784,7 @@ smb2_async_writev(struct cifs_writedata *wdata, struct cifs_io_parms *io_parms = NULL; int credit_request; - if (!wdata->server) + if (!wdata->server || wdata->replay) server = wdata->server = cifs_pick_channel(tcon->ses); /* @@ -4753,6 +4869,8 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_nvec = 1; rqst.rq_iter = wdata->iter; rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); + if (wdata->replay) + smb2_set_replay(server, &rqst); #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); @@ -4826,18 +4944,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, int flags = 0; unsigned int total_len; struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; +replay_again: + /* reinitialize for possible replay */ + flags = 0; *nbytes = 0; - - if (n_vec < 1) - return rc; - if (!io_parms->server) io_parms->server = cifs_pick_channel(io_parms->tcon->ses); server = io_parms->server; if (server == NULL) return -ECONNABORTED; + if (n_vec < 1) + return rc; + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server, (void **) &req, &total_len); if (rc) @@ -4871,6 +4992,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_iov = iov; rqst.rq_nvec = n_vec + 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, io_parms->tcon->ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4895,6 +5019,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(io_parms->tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5077,6 +5206,9 @@ int SMB2_query_directory_init(const unsigned int xid, case SMB_FIND_FILE_POSIX_INFO: req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO; break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION; + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", info_level); @@ -5146,6 +5278,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, /* note that posix payload are variable size */ info_buf_size = sizeof(struct smb2_posix_info); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO); + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", srch_inf->info_level); @@ -5206,8 +5341,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int rc = 0; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !(ses->server)) return -EIO; @@ -5227,6 +5368,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto qdir_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_directory_rsp 
*)rsp_iov.iov_base; @@ -5261,6 +5405,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, qdir_exit: SMB2_query_directory_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5327,8 +5476,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !server) return -EIO; @@ -5356,6 +5511,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } + if (retries) + smb2_set_replay(server, &rqst); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, @@ -5371,6 +5528,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype, rsp); kfree(iov); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5423,12 +5585,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_OBREAK_OP; unsigned int total_len; struct kvec iov[1]; struct kvec rsp_iov; int resp_buf_type; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_OBREAK_OP; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server, @@ -5453,15 +5621,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5547,9 +5721,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; FILE_SYSTEM_POSIX_INFO *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_POSIX_INFORMATION, @@ -5565,6 +5745,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5584,6 +5767,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, posix_qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5598,9 +5786,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct 
cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_FULL_SIZE_INFORMATION, @@ -5616,6 +5810,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5635,6 +5832,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5649,9 +5851,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; unsigned int rsp_len, offset; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (level == FS_DEVICE_INFORMATION) { max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); @@ -5683,6 +5891,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5720,6 +5931,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, qfsattr_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5737,7 +5953,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, unsigned int count; int flags = CIFS_NO_RSP_BUF; unsigned int total_len; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_NO_RSP_BUF; + server = cifs_pick_channel(tcon->ses); cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); @@ -5768,6 +5990,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, tcon->ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); @@ -5779,6 +6004,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, tcon->ses->Suid, rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 0034b537b0b3..b3069911e9dd 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -122,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, extern void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst); extern void smb2_set_related(struct smb_rqst *rqst); +extern void smb2_set_replay(struct TCP_Server_Info *server, + struct smb_rqst *rqst); +extern bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep); /* * SMB2 Worker functions - most of 
protocol specific implementation details diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c index f0ce26414f17..1d1ee9f18f37 100644 --- a/fs/smb/client/smbencrypt.c +++ b/fs/smb/client/smbencrypt.c @@ -26,13 +26,6 @@ #include "cifsproto.h" #include "../common/md4.h" -#ifndef false -#define false 0 -#endif -#ifndef true -#define true 1 -#endif - /* following came from the other byteorder.h to avoid include conflicts */ #define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 4f717ad7c21b..994d70193432 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -400,10 +400,17 @@ unmask: server->conn_id, server->hostname); } smbd_done: - if (rc < 0 && rc != -EINTR) + /* + * there's hardly any use for the layers above to know the + * actual error code here. All they should do at this point is + * to retry the connection and hope it goes away. + */ + if (rc < 0 && rc != -EINTR && rc != -EAGAIN) { cifs_server_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else if (rc > 0) + rc = -ECONNABORTED; + cifs_signal_cifsd_for_reconnect(server, false); + } else if (rc > 0) rc = 0; out: cifs_in_send_dec(server); @@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (!(flags & CIFS_TRANSFORM_REQ)) return __smb_send_rqst(server, num_rqst, rqst); - if (num_rqst > MAX_COMPOUND - 1) - return -ENOMEM; + if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1)) + return -EIO; if (!server->ops->init_transform_rq) { cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n"); @@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; + if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + continue; + /* * strictly speaking, we should pick up req_lock to read * server->in_flight. 
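
The smbd_done relabelling above is the transport half of the replay story: any send failure other than -EINTR or -EAGAIN is now logged once, mapped to -ECONNABORTED, and the demultiplex thread is signalled to reconnect, which means callers see exactly the two errno values that is_replayable_error() accepts. A sketch restating that contract as a standalone helper (illustrative only; the logic mirrors the hunk above):

static int map_send_error(struct TCP_Server_Info *server, int rc)
{
	if (rc < 0 && rc != -EINTR && rc != -EAGAIN) {
		cifs_server_dbg(VFS,
				"Error %d sending data on socket to server\n",
				rc);
		rc = -ECONNABORTED;
		cifs_signal_cifsd_for_reconnect(server, false);
	} else if (rc > 0) {
		rc = 0;		/* a positive byte count means success */
	}
	return rc;	/* now one of 0, -EINTR, -EAGAIN, -ECONNABORTED */
}

The cifs_pick_channel() change in the same hunk closes the loop: channels already marked for reconnect are skipped, so a replayed request lands on a healthy channel rather than the one that just failed.
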
But it shouldn't matter much here if we diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index b7521e41402e..0ebf91ffa236 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -304,7 +304,8 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, - KSMBD_EVENT_MAX + __KSMBD_EVENT_MAX, + KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; /* diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c index 9e8afaa686e3..1a5faa6f6e7b 100644 --- a/fs/smb/server/misc.c +++ b/fs/smb/server/misc.c @@ -261,6 +261,7 @@ out_ascii: /** * ksmbd_extract_sharename() - get share name from tree connect request + * @um: pointer to a unicode_map structure for character encoding handling * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index ba7a72a6a4f4..0c97d3c86072 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6173,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } kvfree(rpc_resp); } else { err = ksmbd_iov_pin_rsp(work, (void *)rsp, @@ -6384,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } ksmbd_fd_put(work, fp); return 0; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index b49d47bdafc9..f29bb03f0dc4 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info) static int handle_generic_event(struct sk_buff *skb, struct genl_info *info); static int ksmbd_ipc_heartbeat_request(void); -static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = { +static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { [KSMBD_EVENT_UNSPEC] = { .len = 0, }, @@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info) return -EPERM; #endif - if (type >= KSMBD_EVENT_MAX) { + if (type > KSMBD_EVENT_MAX) { WARN_ON(1); return -EINVAL; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 9d4222154dcc..002a3f0dc7c5 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -365,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, * @t: TCP transport instance * @buf: buffer to store read data from socket * @to_read: number of bytes to read from socket + * @max_retries: number of retries if reading from socket fails * * Return: on success return number of bytes read from socket, * otherwise return error number @@ -416,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket) /** * create_socket - create socket for ksmbd/0 + * @iface: interface to bind the created socket to * * Return: 0 on success, error number otherwise */ diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 6795fda2af19..110e8a272189 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -34,7 +34,15 @@ static DEFINE_MUTEX(eventfs_mutex); /* Choose something "unique" ;-) */ #define EVENTFS_FILE_INODE_INO 0x12c4e37 -#define 
EVENTFS_DIR_INODE_INO 0x134b2f5 + +/* Just try to make something consistent and unique */ +static int eventfs_dir_ino(struct eventfs_inode *ei) +{ + if (!ei->ino) + ei->ino = get_next_ino(); + + return ei->ino; +} /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from @@ -54,6 +62,46 @@ enum { #define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) +/* + * eventfs_inode reference count management. + * + * NOTE! We count only references from dentries, in the + * form 'dentry->d_fsdata'. There are also references from + * directory inodes ('ti->private'), but the dentry reference + * count is always a superset of the inode reference count. + */ +static void release_ei(struct kref *ref) +{ + struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + + WARN_ON_ONCE(!ei->is_freed); + + kfree(ei->entry_attrs); + kfree_const(ei->name); + kfree_rcu(ei, rcu); +} + +static inline void put_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_put(&ei->kref, release_ei); +} + +static inline void free_ei(struct eventfs_inode *ei) +{ + if (ei) { + ei->is_freed = 1; + put_ei(ei); + } +} + +static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_get(&ei->kref); + return ei; +} + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); @@ -148,33 +196,30 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } -static void update_top_events_attr(struct eventfs_inode *ei, struct dentry *dentry) +static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb) { - struct inode *inode; + struct inode *root; /* Only update if the "events" was on the top level */ if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL)) return; /* Get the tracefs root inode. */ - inode = d_inode(dentry->d_sb->s_root); - ei->attr.uid = inode->i_uid; - ei->attr.gid = inode->i_gid; + root = d_inode(sb->s_root); + ei->attr.uid = root->i_uid; + ei->attr.gid = root->i_gid; } static void set_top_events_ownership(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); struct eventfs_inode *ei = ti->private; - struct dentry *dentry; /* The top events directory doesn't get automatically updated */ if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL)) return; - dentry = ei->dentry; - - update_top_events_attr(ei, dentry); + update_top_events_attr(ei, inode->i_sb); if (!(ei->attr.mode & EVENTFS_SAVE_UID)) inode->i_uid = ei->attr.uid; @@ -225,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) { struct eventfs_inode *ei; - mutex_lock(&eventfs_mutex); do { - /* The parent always has an ei, except for events itself */ - ei = dentry->d_parent->d_fsdata; + // The parent is stable because we do not do renames + dentry = dentry->d_parent; + // ... 
and directories always have d_fsdata + ei = dentry->d_fsdata; /* * If the ei is being freed, the ownership of the children @@ -238,12 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) ei = NULL; break; } - - dentry = ei->dentry; + // Walk upwards until you find the events inode } while (!ei->is_events); - mutex_unlock(&eventfs_mutex); - update_top_events_attr(ei, dentry); + update_top_events_attr(ei, dentry->d_sb); return ei; } @@ -273,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, inode->i_gid = attr->gid; } -static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level) -{ - struct eventfs_inode *ei_child; - - /* at most we have events/system/event */ - if (WARN_ON_ONCE(level > 3)) - return; - - ei->attr.gid = gid; - - if (ei->entry_attrs) { - for (int i = 0; i < ei->nr_entries; i++) { - ei->entry_attrs[i].gid = gid; - } - } - - /* - * Only eventfs_inode with dentries are updated, make sure - * all eventfs_inodes are updated. If one of the children - * do not have a dentry, this function must traverse it. - */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - if (!ei_child->dentry) - update_gid(ei_child, gid, level + 1); - } -} - -void eventfs_update_gid(struct dentry *dentry, kgid_t gid) -{ - struct eventfs_inode *ei = dentry->d_fsdata; - int idx; - - idx = srcu_read_lock(&eventfs_srcu); - update_gid(ei, gid, 0); - srcu_read_unlock(&eventfs_srcu, idx); -} - /** - * create_file - create a file in the tracefs filesystem - * @name: the name of the file to create. + * lookup_file - look up a file in the tracefs filesystem + * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user - * @parent: parent dentry for this file. * @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. * @@ -324,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid) * directory. The inode.i_private pointer will point to @data in the open() * call. 
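
The kref helpers introduced above are the heart of the rewrite: an eventfs_inode no longer tracks its dentries, it is simply pinned by them through d_fsdata, and free_ei() only marks it dead while the memory is reclaimed via RCU once the last reference drops. A compressed lifetime sketch, using the helper names from the patch with an illustrative call sequence:

	struct eventfs_inode *ei = alloc_ei("example");	/* kref == 1 */

	/* each instantiated dentry pins the ei through d_fsdata */
	dentry->d_fsdata = get_ei(ei);			/* kref == 2 */

	/* removal marks it dead and drops the creation reference */
	free_ei(ei);					/* kref == 1, is_freed set */

	/* dcache eviction calls ->d_release, dropping the last ref */
	eventfs_d_release(dentry);			/* kref == 0 -> kfree_rcu() */

This is also what lets eventfs_find_events() above walk d_parent without taking eventfs_mutex: the parent chain is stable (eventfs never renames), and an ei reached through d_fsdata cannot be freed while the dentry holds its reference.
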
*/ -static struct dentry *create_file(const char *name, umode_t mode, +static struct dentry *lookup_file(struct eventfs_inode *parent_ei, + struct dentry *dentry, + umode_t mode, struct eventfs_attr *attr, - struct dentry *parent, void *data, + void *data, const struct file_operations *fop) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; if (WARN_ON_ONCE(!S_ISREG(mode))) - return NULL; - - WARN_ON_ONCE(!parent); - dentry = eventfs_start_creating(name, parent); - - if (IS_ERR(dentry)) - return dentry; + return ERR_PTR(-EIO); inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, attr, mode); @@ -361,32 +361,31 @@ static struct dentry *create_file(const char *name, umode_t mode, ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; - d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + + // Files have their parent's ei as their fsdata + dentry->d_fsdata = get_ei(parent_ei); + + d_add(dentry, inode); + return NULL; }; /** - * create_dir - create a dir in the tracefs filesystem + * lookup_dir_entry - look up a dir in the tracefs filesystem + * @dentry: the directory to look up * @ei: the eventfs_inode that represents the directory to create - * @parent: parent dentry for this file. * - * This function will create a dentry for a directory represented by + * This function will look up a dentry for a directory represented by * a eventfs_inode. */ -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) +static struct dentry *lookup_dir_entry(struct dentry *dentry, + struct eventfs_inode *pei, struct eventfs_inode *ei) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; - dentry = eventfs_start_creating(ei->name, parent); - if (IS_ERR(dentry)) - return dentry; - inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, &ei->attr, @@ -396,68 +395,50 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_fop = &eventfs_file_operations; /* All directories will have the same inode number */ - inode->i_ino = EVENTFS_DIR_INODE_INO; + inode->i_ino = eventfs_dir_ino(ei); ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; + /* Only directories have ti->private set to an ei, not files */ + ti->private = ei; - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + dentry->d_fsdata = get_ei(ei); + + d_add(dentry, inode); + return NULL; } -static void free_ei(struct eventfs_inode *ei) +static inline struct eventfs_inode *alloc_ei(const char *name) { - kfree_const(ei->name); - kfree(ei->d_children); - kfree(ei->entry_attrs); - kfree(ei); + struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL); + + if (!ei) + return NULL; + + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); + return NULL; + } + kref_init(&ei->kref); + return ei; } /** - * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode - * @ti: the tracefs_inode of the dentry + * eventfs_d_release - 
 
 /**
- * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
- * @ti: the tracefs_inode of the dentry
+ * eventfs_d_release - dentry is going away
  * @dentry: dentry which has the reference to remove.
  *
  * Remove the association between a dentry and an eventfs_inode.
  */
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_d_release(struct dentry *dentry)
 {
-	struct eventfs_inode *ei;
-	int i;
-
-	mutex_lock(&eventfs_mutex);
-
-	ei = dentry->d_fsdata;
-	if (!ei)
-		goto out;
-
-	/* This could belong to one of the files of the ei */
-	if (ei->dentry != dentry) {
-		for (i = 0; i < ei->nr_entries; i++) {
-			if (ei->d_children[i] == dentry)
-				break;
-		}
-		if (WARN_ON_ONCE(i == ei->nr_entries))
-			goto out;
-		ei->d_children[i] = NULL;
-	} else if (ei->is_freed) {
-		free_ei(ei);
-	} else {
-		ei->dentry = NULL;
-	}
-
-	dentry->d_fsdata = NULL;
- out:
-	mutex_unlock(&eventfs_mutex);
+	put_ei(dentry->d_fsdata);
 }
 
 /**
- * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * lookup_file_dentry - create a dentry for a file of an eventfs_inode
  * @ei: the eventfs_inode that the file will be created under
- * @idx: the index into the d_children[] of the @ei
+ * @idx: the index into the entry_attrs[] of the @ei
  * @parent: The parent dentry of the created file.
  * @name: The name of the file to create
  * @mode: The mode of the file.
@@ -468,163 +449,17 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
  * address located at @e_dentry.
  */
 static struct dentry *
-create_file_dentry(struct eventfs_inode *ei, int idx,
-		   struct dentry *parent, const char *name, umode_t mode, void *data,
+lookup_file_dentry(struct dentry *dentry,
+		   struct eventfs_inode *ei, int idx,
+		   umode_t mode, void *data,
 		   const struct file_operations *fops)
 {
 	struct eventfs_attr *attr = NULL;
-	struct dentry **e_dentry = &ei->d_children[idx];
-	struct dentry *dentry;
-
-	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
 
-	mutex_lock(&eventfs_mutex);
-	if (ei->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	/* If the e_dentry already has a dentry, use it */
-	if (*e_dentry) {
-		dget(*e_dentry);
-		mutex_unlock(&eventfs_mutex);
-		return *e_dentry;
-	}
-
-	/* ei->entry_attrs are protected by SRCU */
 	if (ei->entry_attrs)
 		attr = &ei->entry_attrs[idx];
-	mutex_unlock(&eventfs_mutex);
-
-	dentry = create_file(name, mode, attr, parent, data, fops);
-
-	mutex_lock(&eventfs_mutex);
-
-	if (IS_ERR_OR_NULL(dentry)) {
-		/*
-		 * When the mutex was released, something else could have
-		 * created the dentry for this e_dentry. In which case
-		 * use that one.
-		 *
-		 * If ei->is_freed is set, the e_dentry is currently on its
-		 * way to being freed, don't return it. If e_dentry is NULL
-		 * it means it was already freed.
-		 */
-		if (ei->is_freed) {
-			dentry = NULL;
-		} else {
-			dentry = *e_dentry;
-			dget(dentry);
-		}
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-
-	if (!*e_dentry && !ei->is_freed) {
-		*e_dentry = dentry;
-		dentry->d_fsdata = ei;
-	} else {
-		/*
-		 * Should never happen unless we get here due to being freed.
-		 * Otherwise it means two dentries exist with the same name.
-		 */
-		WARN_ON_ONCE(!ei->is_freed);
-		dentry = NULL;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	return dentry;
-}
-
-/**
- * eventfs_post_create_dir - post create dir routine
- * @ei: eventfs_inode of recently created dir
- *
- * Map the meta-data of files within an eventfs dir to their parent dentry
- */
-static void eventfs_post_create_dir(struct eventfs_inode *ei)
-{
-	struct eventfs_inode *ei_child;
-	struct tracefs_inode *ti;
-
-	lockdep_assert_held(&eventfs_mutex);
-
-	/* srcu lock already held */
-	/* fill parent-child relation */
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
-		ei_child->d_parent = ei->dentry;
-	}
-
-	ti = get_tracefs(ei->dentry->d_inode);
-	ti->private = ei;
-}
-
-/**
- * create_dir_dentry - Create a directory dentry for the eventfs_inode
- * @pei: The eventfs_inode parent of ei.
- * @ei: The eventfs_inode to create the directory for
- * @parent: The dentry of the parent of this directory
- *
- * This creates and attaches a directory dentry to the eventfs_inode @ei.
- */
-static struct dentry *
-create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
-		  struct dentry *parent)
-{
-	struct dentry *dentry = NULL;
-
-	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
-
-	mutex_lock(&eventfs_mutex);
-	if (pei->is_freed || ei->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	if (ei->dentry) {
-		/* If the eventfs_inode already has a dentry, use it */
-		dentry = ei->dentry;
-		dget(dentry);
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	dentry = create_dir(ei, parent);
-
-	mutex_lock(&eventfs_mutex);
-
-	if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
-		/*
-		 * When the mutex was released, something else could have
-		 * created the dentry for this e_dentry. In which case
-		 * use that one.
-		 *
-		 * If ei->is_freed is set, the e_dentry is currently on its
-		 * way to being freed.
-		 */
-		dentry = ei->dentry;
-		if (dentry)
-			dget(dentry);
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-
-	if (!ei->dentry && !ei->is_freed) {
-		ei->dentry = dentry;
-		eventfs_post_create_dir(ei);
-		dentry->d_fsdata = ei;
-	} else {
-		/*
-		 * Should never happen unless we get here due to being freed.
-		 * Otherwise it means two dentries exist with the same name.
-		 */
-		WARN_ON_ONCE(!ei->is_freed);
-		dentry = NULL;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	return dentry;
+	return lookup_file(ei, dentry, mode, attr, data, fops);
 }
 
 /**
@@ -641,79 +476,50 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 					  struct dentry *dentry,
 					  unsigned int flags)
 {
-	const struct file_operations *fops;
-	const struct eventfs_entry *entry;
 	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct dentry *ei_dentry = NULL;
-	struct dentry *ret = NULL;
-	struct dentry *d;
 	const char *name = dentry->d_name.name;
-	umode_t mode;
-	void *data;
-	int idx;
-	int i;
-	int r;
+	struct dentry *result = NULL;
 
 	ti = get_tracefs(dir);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
-		return NULL;
-
-	/* Grab srcu to prevent the ei from going away */
-	idx = srcu_read_lock(&eventfs_srcu);
+		return ERR_PTR(-EIO);
 
-	/*
-	 * Grab the eventfs_mutex to consistent value from ti->private.
-	 * This s
-	 */
 	mutex_lock(&eventfs_mutex);
-	ei = READ_ONCE(ti->private);
-	if (ei && !ei->is_freed)
-		ei_dentry = READ_ONCE(ei->dentry);
-	mutex_unlock(&eventfs_mutex);
 
-	if (!ei || !ei_dentry)
+	ei = ti->private;
+	if (!ei || ei->is_freed)
 		goto out;
 
-	data = ei->data;
-
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
+	list_for_each_entry(ei_child, &ei->children, list) {
 		if (strcmp(ei_child->name, name) != 0)
 			continue;
-		ret = simple_lookup(dir, dentry, flags);
-		if (IS_ERR(ret))
+		if (ei_child->is_freed)
 			goto out;
-		d = create_dir_dentry(ei, ei_child, ei_dentry);
-		dput(d);
+		result = lookup_dir_entry(dentry, ei, ei_child);
 		goto out;
 	}
 
-	for (i = 0; i < ei->nr_entries; i++) {
-		entry = &ei->entries[i];
-		if (strcmp(name, entry->name) == 0) {
-			void *cdata = data;
-			mutex_lock(&eventfs_mutex);
-			/* If ei->is_freed, then the event itself may be too */
-			if (!ei->is_freed)
-				r = entry->callback(name, &mode, &cdata, &fops);
-			else
-				r = -1;
-			mutex_unlock(&eventfs_mutex);
-			if (r <= 0)
-				continue;
-			ret = simple_lookup(dir, dentry, flags);
-			if (IS_ERR(ret))
-				goto out;
-			d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops);
-			dput(d);
-			break;
-		}
+	for (int i = 0; i < ei->nr_entries; i++) {
+		void *data;
+		umode_t mode;
+		const struct file_operations *fops;
+		const struct eventfs_entry *entry = &ei->entries[i];
+
+		if (strcmp(name, entry->name) != 0)
+			continue;
+
+		data = ei->data;
+		if (entry->callback(name, &mode, &data, &fops) <= 0)
+			goto out;
+
+		result = lookup_file_dentry(dentry, ei, i, mode, data, fops);
+		goto out;
 	}
  out:
-	srcu_read_unlock(&eventfs_srcu, idx);
-	return ret;
+	mutex_unlock(&eventfs_mutex);
+	return result;
 }
 
 /*
@@ -802,7 +608,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx)
 
 			name = ei_child->name;
 
-			ino = EVENTFS_DIR_INODE_INO;
+			ino = eventfs_dir_ino(ei_child);
 
 			if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR))
 				goto out_dec;
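Both hunks above call eventfs_dir_ino(), which is not itself quoted in this diff. Given the new ei->ino field ("The saved inode number" in the internal.h hunk below), the obvious shape is lazy allocation; a sketch, assuming get_next_ino() as the allocator:

static unsigned int eventfs_dir_ino(struct eventfs_inode *ei)
{
	if (!ei->ino)
		ei->ino = get_next_ino();

	return ei->ino;
}

This replaces the old scheme where every eventfs directory shared EVENTFS_DIR_INODE_INO, which could confuse userspace tools that use (st_dev, st_ino) pairs to detect directory loops.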
@@ -863,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 	if (!parent)
 		return ERR_PTR(-EINVAL);
 
-	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	ei = alloc_ei(name);
 	if (!ei)
 		return ERR_PTR(-ENOMEM);
 
-	ei->name = kstrdup_const(name, GFP_KERNEL);
-	if (!ei->name) {
-		kfree(ei);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if (size) {
-		ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL);
-		if (!ei->d_children) {
-			kfree_const(ei->name);
-			kfree(ei);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-
 	ei->entries = entries;
 	ei->nr_entries = size;
 	ei->data = data;
@@ -889,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 	INIT_LIST_HEAD(&ei->list);
 
 	mutex_lock(&eventfs_mutex);
-	if (!parent->is_freed) {
+	if (!parent->is_freed)
 		list_add_tail(&ei->list, &parent->children);
-		ei->d_parent = parent->dentry;
-	}
 	mutex_unlock(&eventfs_mutex);
 
 	/* Was the parent freed? */
@@ -932,28 +721,20 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry);
 
-	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	ei = alloc_ei(name);
 	if (!ei)
-		goto fail_ei;
+		goto fail;
 
 	inode = tracefs_get_inode(dentry->d_sb);
 	if (unlikely(!inode))
 		goto fail;
 
-	if (size) {
-		ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL);
-		if (!ei->d_children)
-			goto fail;
-	}
-
-	ei->dentry = dentry;
+	// Note: we have a ref to the dentry from tracefs_start_creating()
+	ei->events_dir = dentry;
 	ei->entries = entries;
 	ei->nr_entries = size;
 	ei->is_events = 1;
 	ei->data = data;
-	ei->name = kstrdup_const(name, GFP_KERNEL);
-	if (!ei->name)
-		goto fail;
 
 	/* Save the ownership of this directory */
 	uid = d_inode(dentry->d_parent)->i_uid;
@@ -984,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	inode->i_op = &eventfs_root_dir_inode_operations;
 	inode->i_fop = &eventfs_file_operations;
 
-	dentry->d_fsdata = ei;
+	dentry->d_fsdata = get_ei(ei);
 
-	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inc_nlink(inode);
+	/*
+	 * Keep all eventfs directories with i_nlink == 1.
+	 * Due to the dynamic nature of the dentry creations and not
+	 * wanting to add a pointer to the parent eventfs_inode in the
+	 * eventfs_inode structure, keeping the i_nlink in sync with the
+	 * number of directories would cause too much complexity for
+	 * something not worth much. Keeping directory links at 1
+	 * tells userspace not to trust the link number.
+	 */
 	d_instantiate(dentry, inode);
+	/* The dentry of the "events" parent does keep track though */
 	inc_nlink(dentry->d_parent->d_inode);
 	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
 	tracefs_end_creating(dentry);
@@ -996,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	return ei;
 
 fail:
-	kfree(ei->d_children);
-	kfree(ei);
- fail_ei:
+	free_ei(ei);
 	tracefs_failed_creating(dentry);
 	return ERR_PTR(-ENOMEM);
 }
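The i_nlink comment above has a user-visible consequence worth spelling out: traditionally a directory's link count is 2 plus its number of subdirectories, and some tools infer subdirectory counts from st_nlink - 2. Pinning eventfs directories at nlink 1 opts out of that contract. A userspace illustration (the path is only an example):

#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	/* Any eventfs directory would do here. */
	if (stat("/sys/kernel/tracing/events/sched", &st) == 0)
		printf("st_nlink = %ju\n", (uintmax_t)st.st_nlink);

	return 0;	/* expect st_nlink == 1 after this change */
}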
-static LLIST_HEAD(free_list);
-
-static void eventfs_workfn(struct work_struct *work)
-{
-	struct eventfs_inode *ei, *tmp;
-	struct llist_node *llnode;
-
-	llnode = llist_del_all(&free_list);
-	llist_for_each_entry_safe(ei, tmp, llnode, llist) {
-		/* This dput() matches the dget() from unhook_dentry() */
-		for (int i = 0; i < ei->nr_entries; i++) {
-			if (ei->d_children[i])
-				dput(ei->d_children[i]);
-		}
-		/* This should only get here if it had a dentry */
-		if (!WARN_ON_ONCE(!ei->dentry))
-			dput(ei->dentry);
-	}
-}
-
-static DECLARE_WORK(eventfs_work, eventfs_workfn);
-
-static void free_rcu_ei(struct rcu_head *head)
-{
-	struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
-
-	if (ei->dentry) {
-		/* Do not free the ei until all references of dentry are gone */
-		if (llist_add(&ei->llist, &free_list))
-			queue_work(system_unbound_wq, &eventfs_work);
-		return;
-	}
-
-	/* If the ei doesn't have a dentry, neither should its children */
-	for (int i = 0; i < ei->nr_entries; i++) {
-		WARN_ON_ONCE(ei->d_children[i]);
-	}
-
-	free_ei(ei);
-}
-
-static void unhook_dentry(struct dentry *dentry)
-{
-	if (!dentry)
-		return;
-	/*
-	 * Need to add a reference to the dentry that is expected by
-	 * simple_recursive_removal(), which will include a dput().
-	 */
-	dget(dentry);
-
-	/*
-	 * Also add a reference for the dput() in eventfs_workfn().
-	 * That is required as that dput() will free the ei after
-	 * the SRCU grace period is over.
-	 */
-	dget(dentry);
-}
-
 /**
  * eventfs_remove_rec - remove eventfs dir or file from list
  * @ei: eventfs_inode to be removed.
@@ -1074,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
 {
 	struct eventfs_inode *ei_child;
 
-	if (!ei)
-		return;
 	/*
 	 * Check recursion depth. It should never be greater than 3:
 	 * 0 - events/
@@ -1087,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
 		return;
 
 	/* search for nested folders or files */
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 lockdep_is_held(&eventfs_mutex)) {
-		/* Children only have dentry if parent does */
-		WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+	list_for_each_entry(ei_child, &ei->children, list)
 		eventfs_remove_rec(ei_child, level + 1);
-	}
-
-
-	ei->is_freed = 1;
-
-	for (int i = 0; i < ei->nr_entries; i++) {
-		if (ei->d_children[i]) {
-			/* Children only have dentry if parent does */
-			WARN_ON_ONCE(!ei->dentry);
-			unhook_dentry(ei->d_children[i]);
-		}
-	}
-
-	unhook_dentry(ei->dentry);
-
-	list_del_rcu(&ei->list);
-	call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
+	list_del(&ei->list);
+	free_ei(ei);
 }
 
 /**
@@ -1119,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
  */
 void eventfs_remove_dir(struct eventfs_inode *ei)
 {
-	struct dentry *dentry;
-
 	if (!ei)
 		return;
 
 	mutex_lock(&eventfs_mutex);
-	dentry = ei->dentry;
 	eventfs_remove_rec(ei, 0);
 	mutex_unlock(&eventfs_mutex);
-
-	/*
-	 * If any of the ei children has a dentry, then the ei itself
-	 * must have a dentry.
-	 */
-	if (dentry)
-		simple_recursive_removal(dentry, NULL);
 }
 
 /**
@@ -1147,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
 {
 	struct dentry *dentry;
 
-	dentry = ei->dentry;
+	dentry = ei->events_dir;
+	if (!dentry)
+		return;
+
+	ei->events_dir = NULL;
 	eventfs_remove_dir(ei);
 
 	/*
@@ -1157,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
 	 * sticks around while the other ei->dentry are created
 	 * and destroyed dynamically.
 	 */
+	d_invalidate(dentry);
 	dput(dentry);
 }
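That is the whole of the event_inode.c rework: eventfs_inodes are now plain refcounted metadata, and dentries/inodes only materialize through ->lookup(). For orientation, here is a hypothetical consumer of the (real) API as it stands after this change; the callback signature matches the one invoked by eventfs_root_lookup() above:

#include <linux/tracefs.h>

static int demo_callback(const char *name, umode_t *mode, void **data,
			 const struct file_operations **fops)
{
	/* Return > 0 and fill in *mode/*fops to expose a file. */
	return 0;
}

static const struct eventfs_entry demo_entries[] = {
	{ .name = "demo", .callback = demo_callback },
};

static void demo(struct dentry *parent, void *data)
{
	struct eventfs_inode *events;

	events = eventfs_create_events_dir("events", parent, demo_entries,
					   ARRAY_SIZE(demo_entries), data);
	if (IS_ERR(events))
		return;

	/* Directories hang off eventfs_inodes now, not off dentries. */
	eventfs_create_dir("subdir", events, NULL, 0, data);

	/* Tears down "subdir" recursively, then the events dir itself. */
	eventfs_remove_events_dir(events);
}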
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index e1b172c0e091..d65ffad4c327 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
 	if (!ti)
 		return NULL;
 
-	ti->flags = 0;
-
 	return &ti->vfs_inode;
 }
 
@@ -379,21 +377,30 @@ static const struct super_operations tracefs_super_operations = {
 	.show_options	= tracefs_show_options,
 };
 
-static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+/*
+ * It would be cleaner if eventfs had its own dentry ops.
+ *
+ * Note that d_revalidate is called potentially under RCU,
+ * so it can't take the eventfs mutex etc. It's fine - if
+ * we open a file just as it's marked dead, things will
+ * still work just fine, and just see the old stale case.
+ */
+static void tracefs_d_release(struct dentry *dentry)
 {
-	struct tracefs_inode *ti;
+	if (dentry->d_fsdata)
+		eventfs_d_release(dentry);
+}
 
-	if (!dentry || !inode)
-		return;
+static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct eventfs_inode *ei = dentry->d_fsdata;
 
-	ti = get_tracefs(inode);
-	if (ti && ti->flags & TRACEFS_EVENT_INODE)
-		eventfs_set_ei_status_free(ti, dentry);
-	iput(inode);
+	return !(ei && ei->is_freed);
 }
 
 static const struct dentry_operations tracefs_dentry_operations = {
-	.d_iput = tracefs_dentry_iput,
+	.d_revalidate = tracefs_d_revalidate,
+	.d_release = tracefs_d_release,
 };
 
 static int trace_fill_super(struct super_block *sb, void *data, int silent)
@@ -497,75 +504,6 @@ struct dentry *tracefs_end_creating(struct dentry *dentry)
 	return dentry;
 }
 
-/**
- * eventfs_start_creating - start the process of creating a dentry
- * @name: Name of the file created for the dentry
- * @parent: The parent dentry where this dentry will be created
- *
- * This is a simple helper function for the dynamically created eventfs
- * files. When the directory of the eventfs files are accessed, their
- * dentries are created on the fly. This function is used to start that
- * process.
- */
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
-{
-	struct dentry *dentry;
-	int error;
-
-	/* Must always have a parent. */
-	if (WARN_ON_ONCE(!parent))
-		return ERR_PTR(-EINVAL);
-
-	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
-			      &tracefs_mount_count);
-	if (error)
-		return ERR_PTR(error);
-
-	if (unlikely(IS_DEADDIR(parent->d_inode)))
-		dentry = ERR_PTR(-ENOENT);
-	else
-		dentry = lookup_one_len(name, parent, strlen(name));
-
-	if (!IS_ERR(dentry) && dentry->d_inode) {
-		dput(dentry);
-		dentry = ERR_PTR(-EEXIST);
-	}
-
-	if (IS_ERR(dentry))
-		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
-	return dentry;
-}
-
-/**
- * eventfs_failed_creating - clean up a failed eventfs dentry creation
- * @dentry: The dentry to clean up
- *
- * If after calling eventfs_start_creating(), a failure is detected, the
- * resources created by eventfs_start_creating() needs to be cleaned up. In
- * that case, this function should be called to perform that clean up.
- */
-struct dentry *eventfs_failed_creating(struct dentry *dentry)
-{
-	dput(dentry);
-	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-	return NULL;
-}
-
-/**
- * eventfs_end_creating - Finish the process of creating a eventfs dentry
- * @dentry: The dentry that has successfully been created.
- *
- * This function is currently just a place holder to match
- * eventfs_start_creating(). In case any synchronization needs to be added,
- * this function will be used to implement that without having to modify
- * the callers of eventfs_start_creating().
- */
-struct dentry *eventfs_end_creating(struct dentry *dentry)
-{
-	return dentry;
-}
-
 /* Find the inode that this will use for default */
 static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
 {
@@ -779,7 +717,11 @@ static void init_once(void *foo)
 {
 	struct tracefs_inode *ti = (struct tracefs_inode *) foo;
 
+	/* inode_init_once() calls memset() on the vfs_inode portion */
 	inode_init_once(&ti->vfs_inode);
+
+	/* Zero out the rest */
+	memset_after(ti, 0, vfs_inode);
 }
 
 static int __init tracefs_init(void)
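memset_after() (include/linux/string.h) zeroes every byte of an object laid out after the named member, which is exactly why the next hunk moves vfs_inode to the top of struct tracefs_inode: only what follows it gets cleared when the slab constructor reuses an object. Roughly, the call open-codes to:

/* What memset_after(ti, 0, vfs_inode) boils down to: */
memset((u8 *)ti + offsetofend(struct tracefs_inode, vfs_inode), 0,
       sizeof(*ti) - offsetofend(struct tracefs_inode, vfs_inode));

Any future field added before vfs_inode would silently escape the zeroing, hence the comment kept in the struct definition.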
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 12b7d0150ae9..beb3dcd0e434 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -11,9 +11,10 @@ enum {
 };
 
 struct tracefs_inode {
+	struct inode            vfs_inode;
+	/* The below gets initialized with memset_after(ti, 0, vfs_inode) */
 	unsigned long           flags;
 	void                    *private;
-	struct inode            vfs_inode;
 };
 
 /*
@@ -31,42 +32,37 @@ struct eventfs_attr {
 /*
  * struct eventfs_inode - hold the properties of the eventfs directories.
  * @list:	link list into the parent directory
+ * @rcu:	Union with @list for freeing
+ * @children:	link list into the child eventfs_inode
  * @entries:	the array of entries representing the files in the directory
  * @name:	the name of the directory to create
- * @children:	link list into the child eventfs_inode
- * @dentry:	the dentry of the directory
- * @d_parent:	pointer to the parent's dentry
- * @d_children: The array of dentries to represent the files when created
+ * @events_dir:	the dentry of the events directory
  * @entry_attrs: Saved mode and ownership of the entries
- * @attr:	Saved mode and ownership of eventfs_inode itself
 * @data:	The private data to pass to the callbacks
+ * @attr:	Saved mode and ownership of eventfs_inode itself
 * @is_freed:	Flag set if the eventfs is on its way to be freed
 *		Note if is_freed is set, then dentry is corrupted.
+ * @is_events:	Flag set for only the top level "events" directory
 * @nr_entries: The number of items in @entries
+ * @ino:	The saved inode number
 */
 struct eventfs_inode {
-	struct list_head	list;
+	union {
+		struct list_head	list;
+		struct rcu_head		rcu;
+	};
+	struct list_head	children;
 	const struct eventfs_entry *entries;
 	const char		*name;
-	struct list_head	children;
-	struct dentry		*dentry; /* Check is_freed to access */
-	struct dentry		*d_parent;
-	struct dentry		**d_children;
+	struct dentry		*events_dir;
 	struct eventfs_attr	*entry_attrs;
-	struct eventfs_attr	attr;
 	void			*data;
-	/*
-	 * Union - used for deletion
-	 * @llist:	for calling dput() if needed after RCU
-	 * @rcu:	eventfs_inode to delete in RCU
-	 */
-	union {
-		struct llist_node	llist;
-		struct rcu_head		rcu;
-	};
+	struct eventfs_attr	attr;
+	struct kref		kref;
 	unsigned int		is_freed:1;
 	unsigned int		is_events:1;
 	unsigned int		nr_entries:30;
+	unsigned int		ino;
 };
 
 static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
@@ -78,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
 struct dentry *tracefs_end_creating(struct dentry *dentry);
 struct dentry *tracefs_failed_creating(struct dentry *dentry);
 struct inode *tracefs_get_inode(struct super_block *sb);
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
-struct dentry *eventfs_failed_creating(struct dentry *dentry);
-struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+
+void eventfs_d_release(struct dentry *dentry);
 
 #endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 9976a00a73f9..e965a48e7db9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
 	bool			do_replace = args->op_flags & XFS_DA_OP_REPLACE;
 
 	args->op_flags &= ~XFS_DA_OP_REPLACE;
-	if (do_replace) {
-		args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+	args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+	if (do_replace)
 		return replace_state;
-	}
+
 	return XFS_DAS_DONE;
 }
 
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 31100120b2c5..e31663cb7b43 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount(
 }
 
 /*
- * Compute the maximum level number of the realtime summary file, as defined by
- * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
- * use of rt volumes with more than 2^32 extents.
- */
-uint8_t
-xfs_compute_rextslog(
-	xfs_rtbxlen_t		rtextents)
-{
-	if (!rtextents)
-		return 0;
-	return xfs_highbit64(rtextents);
-}
-
-/*
 * Compute the number of rtbitmap words needed to populate every block of a
 * bitmap that is large enough to track the given number of rt extents.
 */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 274dc7dae1fa..152a66750af5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -351,20 +351,6 @@ xfs_rtfree_extent(
 int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
 		xfs_filblks_t rtlen);
 
-uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
-
-/* Do we support an rt volume having this number of rtextents? */
-static inline bool
-xfs_validate_rtextents(
-	xfs_rtbxlen_t		rtextents)
-{
-	/* No runt rt volumes */
-	if (rtextents == 0)
-		return false;
-
-	return true;
-}
-
 xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp,
 		xfs_rtbxlen_t rtextents);
 unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
@@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
 # define xfs_rtsummary_read_buf(a,b)			(-ENOSYS)
 # define xfs_rtbuf_cache_relse(a)			(0)
 # define xfs_rtalloc_extent_is_free(m,t,s,l,i)		(-ENOSYS)
-# define xfs_compute_rextslog(rtx)			(0)
-# define xfs_validate_rtextents(rtx)			(false)
 static inline xfs_filblks_t
 xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
 {
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4a9e8588f4c9..5bb6e2bd6dee 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry(
 	}
 	return true;
 }
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+	xfs_rtbxlen_t		rtextents)
+{
+	if (!rtextents)
+		return 0;
+	return xfs_highbit64(rtextents);
+}
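xfs_highbit64() returns the index of the highest set bit (fls64() - 1), so the relocated function is a log2 that also special-cases empty volumes. Illustrative values (hypothetical, not from the patch):

xfs_compute_rextslog(0);		/* 0: no rt volume at all */
xfs_compute_rextslog(1);		/* 0 */
xfs_compute_rextslog(5);		/* 2 */
xfs_compute_rextslog(1ULL << 40);	/* 40: the case highbit32 used to truncate */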
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 19134b23c10b..2e8e8d63d4eb 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
 extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
 		__s64 sunit, __s64 swidth, int sectorsize, bool silent);
 
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 20b5375f2d9c..62e02d5380ad 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
 bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
 		xfs_fileoff_t len);
 
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+	xfs_rtbxlen_t		rtextents)
+{
+	/* No runt rt volumes */
+	if (rtextents == 0)
+		return false;
+
+	return true;
+}
+
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 441ca9977652..46583517377f 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -15,6 +15,7 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bit.h"
+#include "xfs_sb.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/repair.h"
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index fabd0ed9dfa6..b1ff4f33324a 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -16,6 +16,7 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_bit.h"
 #include "xfs_bmap.h"
+#include "xfs_sb.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aff20ddd4a9f..5a2512d20bd0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1496,6 +1496,18 @@ xfs_fs_fill_super(
 
 	mp->m_super = sb;
 
+	/*
+	 * Copy VFS mount flags from the context now that all parameter parsing
+	 * is guaranteed to have been completed by either the old mount API or
+	 * the newer fsopen/fsconfig API.
+	 */
+	if (fc->sb_flags & SB_RDONLY)
+		set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+	if (fc->sb_flags & SB_DIRSYNC)
+		mp->m_features |= XFS_FEAT_DIRSYNC;
+	if (fc->sb_flags & SB_SYNCHRONOUS)
+		mp->m_features |= XFS_FEAT_WSYNC;
+
 	error = xfs_fs_validate_params(mp);
 	if (error)
 		return error;
@@ -1965,6 +1977,11 @@ static const struct fs_context_operations xfs_context_ops = {
 	.free        = xfs_fs_free,
 };
 
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
 static int xfs_init_fs_context(
 	struct fs_context		*fc)
 {
@@ -1996,16 +2013,6 @@ static int xfs_init_fs_context(
 	mp->m_logbsize = -1;
 	mp->m_allocsize_log = 16; /* 64k */
 
-	/*
-	 * Copy binary VFS mount flags we are interested in.
-	 */
-	if (fc->sb_flags & SB_RDONLY)
-		set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
-	if (fc->sb_flags & SB_DIRSYNC)
-		mp->m_features |= XFS_FEAT_DIRSYNC;
-	if (fc->sb_flags & SB_SYNCHRONOUS)
-		mp->m_features |= XFS_FEAT_WSYNC;
-
 	fc->s_fs_info = mp;
 	fc->ops = &xfs_context_ops;
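The ordering problem the two xfs_super.c hunks fix is easiest to see from the new mount API's syscall sequence: ->init_fs_context() runs at fsopen() time, before any "ro"/"sync"-style flags can possibly have been set on the context, while ->fill_super() runs only after configuration is complete. A userspace sketch with raw syscalls (glibc wrappers for these are recent or absent; the device and target path are examples, error handling omitted):

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static void mount_xfs_readonly(void)
{
	/* fsopen() -> xfs_init_fs_context(): fc->sb_flags still empty */
	int fsfd = syscall(SYS_fsopen, "xfs", FSOPEN_CLOEXEC);

	/* only now do the VFS flags land in fc->sb_flags */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);

	/* FSCONFIG_CMD_CREATE -> xfs_fs_fill_super(): flags are complete here */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	int mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
}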