Diffstat (limited to 'fs')
47 files changed, 449 insertions, 179 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 85eea7a4dea3..fc7efd0a7525 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -61,6 +61,13 @@ config BCACHEFS_DEBUG The resulting code will be significantly slower than normal; you probably shouldn't select this option unless you're a developer. +config BCACHEFS_INJECT_TRANSACTION_RESTARTS + bool "Randomly inject transaction restarts" + depends on BCACHEFS_DEBUG + help + Randomly inject transaction restarts in a few core paths - may have a + significant performance penalty + config BCACHEFS_TESTS bool "bcachefs unit and performance tests" depends on BCACHEFS_FS diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5988219c6908..e32fce4fd258 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + int ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) @@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { @@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); @@ -3163,7 +3185,8 @@ out_new_mem: if (old_bytes) { trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: p = trans->mem + trans->mem_top; @@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + trans_set_locked(trans, false); if (trans->restarted) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b9538e6e6d65..b96157f3dc9c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err) return btree_trans_restart_ip(trans, err, _THIS_IP_); } +static 
inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { + trace_and_count(trans->c, trans_restart_injected, trans, ip); + return btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_fault_inject, ip); + } +#endif + return 0; +} + bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); @@ -739,7 +751,7 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) #define for_each_btree_key_max_continue(_trans, _iter, \ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 2760dd9569ed..c4f524b2ca9a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) bch2_trans_verify_not_unlocked_or_in_restart(trans); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) + goto out_reset; + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a6f251eb4164..a09cbe9cd94f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -509,6 +509,9 @@ struct btree_trans { bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + u32 restart_count_this_trans; +#endif u64 last_begin_time; unsigned long last_begin_ip; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f4aeadbe53c1..e4e7c804625e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -681,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; + bch2_trans_begin(trans); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? 
b->data->keys.seq : 0; six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); if (seq == as->old_nodes_seq[i]) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7930ffea3075..26d646e1275c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, bset_tree_last(b))); + (void *) btree_bkey_last(b, t)); ssize_t remaining_space = __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { - if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + if (b->written + block_sectors(c) <= btree_sectors(c)) return bne; } else { if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5360cbb3ec29..f4372cafea2e 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { + percpu_down_read(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 53a421ff136d..9bf316e7b845 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -823,6 +823,7 @@ struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; u64 count; + u64 i_size; }; struct inode_walker { @@ -910,8 +911,9 @@ found: if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = k.k->p.snapshot; - new.count = 0; + new.snapshot = k.k->p.snapshot; + new.count = 0; + new.i_size = 0; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -1116,37 +1118,6 @@ err: return ret; } -static int check_directory_size(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - struct bkey_s_c inode_k, bool *write_inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 new_size = 0; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), - POS(inode_k.k->p.offset, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - - struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(dirent); - - new_size += dirent_occupied_size(&name); - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && inode_u->bi_size != new_size) { - inode_u->bi_size = new_size; - *write_inode = true; - } - - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans, u.bi_journal_seq = journal_cur_seq(&c->journal); do_update = true; } - - if (S_ISDIR(u.bi_mode)) { - ret = check_directory_size(trans, &u, k, &do_update); - - fsck_err_on(ret, - trans, directory_size_mismatch, - 
"directory inode %llu:%u with the mismatch directory size", - u.bi_inum, k.k->p.snapshot); - ret = 0; - } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); @@ -2017,10 +1978,31 @@ fsck_err: return ret; } -static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + darray_for_each(w->inodes, i) + if (fsck_err_on(i->inode.bi_size != i->i_size, + trans, inode_dir_wrong_nlink, + "directory %llu:%u with wrong i_size: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { + i->inode.bi_size = i->i_size; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: + check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } @@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_count(trans, dir); + ret = check_subdir_dirents_count(trans, dir); if (ret) goto err; } @@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { + if (d.v->d_type == DT_DIR) i->count++; + i->i_size += bkey_bytes(d.k); + } out: err: fsck_err: diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index dd508d93e9fc..03892388832b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -411,6 +411,16 @@ void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) __bch2_write_op_error(out, op, op->pos.offset); } +static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset) +{ + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6a9cefb635d6..d373cd181a7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_TYPE_btree; - else if (fn == bch2_btree_key_cache_journal_flush) + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) return JOURNAL_PIN_TYPE_key_cache; else return JOURNAL_PIN_TYPE_other; @@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j, bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_key_cache)| - BIT(JOURNAL_PIN_TYPE_other))) { - *did_work = true; - goto unlock; - } - - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_btree))) { - *did_work = true; - goto unlock; - } + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a198a81d7478..1ef3a28ed6ab 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,7 +53,10 @@ struct journal_buf { */ 
enum journal_pin_type { - JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_btree3, + JOURNAL_PIN_TYPE_btree2, + JOURNAL_PIN_TYPE_btree1, + JOURNAL_PIN_TYPE_btree0, JOURNAL_PIN_TYPE_key_cache, JOURNAL_PIN_TYPE_other, JOURNAL_PIN_TYPE_NR, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 93ba4f4e47ca..441e648f28b5 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -172,7 +172,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, bool should_commit) { if (REFLINK_P_ERROR(p.v)) - return -BCH_ERR_missing_indirect_extent; + return 0; struct bch_fs *c = trans->c; u64 live_start = REFLINK_P_IDX(p.v); @@ -259,8 +259,6 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - bch2_trans_iter_exit(trans, iter); - unsigned size = min((u64) k.k->size, REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - reflink_offset); @@ -268,14 +266,16 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, k.k->p.offset, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } else if (unlikely(REFLINK_P_ERROR(p.v))) { - bch2_trans_iter_exit(trans, iter); - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } *offset_into_extent = reflink_offset - bkey_start_offset(k.k); @@ -300,7 +300,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (ret) return ret; - if (bkey_deleted(k.k)) { + if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) ret = -BCH_ERR_missing_indirect_extent; goto next; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 14f6b6a5fb38..35e07bc8fbd3 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -92,7 +92,7 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ BCH_FSCK_ERR_directory_size_mismatch) \ #define DOWNGRADE_TABLE() \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index ea0a18364751..b86ec013d7d7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -180,9 +180,9 @@ enum bch_fsck_flags { x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a2cac9d0a1a9..b2fae67f8fa3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. 
*/ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -899,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -915,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -956,7 +952,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { @@ -1079,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -2380,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36f51c311bb1..ed3c0d6546c5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1039,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) 
loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1066,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index f761d44b3436..0d1b6d35ff3b 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } -static void netfs_issue_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) +static void netfs_queue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + bool last_subreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, } } + if (last_subreq) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + spin_unlock(&rreq->lock); +} +static void netfs_issue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ switch (subreq->source) { case NETFS_DOWNLOAD_FROM_SERVER: rreq->netfs_ops->issue_read(subreq); @@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) } size -= slice; start += slice; - if (size <= 0) { - smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - } + netfs_queue_read(rreq, subreq, size <= 0); netfs_issue_read(rreq, subreq); cond_resched(); } while (size > 0); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index eb76f98c894b..1c4f953c3d68 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -135,6 +135,8 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_rh_retry_read_req; +extern atomic_t netfs_n_rh_retry_read_subreq; extern atomic_t netfs_n_wh_buffered_write; extern atomic_t netfs_n_wh_writethrough; extern atomic_t netfs_n_wh_dio_write; @@ -147,6 +149,8 @@ extern atomic_t netfs_n_wh_upload_failed; extern atomic_t netfs_n_wh_write; extern atomic_t netfs_n_wh_write_done; extern atomic_t netfs_n_wh_write_failed; +extern atomic_t netfs_n_wh_retry_write_req; +extern atomic_t netfs_n_wh_retry_write_subreq; extern atomic_t netfs_n_wb_lock_skip; extern atomic_t netfs_n_wb_lock_wait; extern atomic_t netfs_n_folioq; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index f65affa5a9e4..636cc5a98ef5 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work) */ void netfs_wake_read_collector(struct netfs_io_request *rreq) { - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { if (!work_pending(&rreq->work)) { netfs_get_request(rreq, netfs_rreq_trace_get_work); if (!queue_work(system_unbound_wq, &rreq->work)) @@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) smp_mb__after_atomic(); /* Clear IN_PROGRESS 
before task state */ /* If we are at the head of the queue, wake up the collector. */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) netfs_wake_read_collector(rreq); netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 2290af0d51ac..0f294b26e08c 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,7 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_stat(&netfs_n_rh_retry_read_subreq); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -48,6 +48,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; netfs_reset_iter(subreq); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); } } @@ -75,7 +76,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) struct iov_iter source; unsigned long long start, len; size_t part; - bool boundary = false; + bool boundary = false, subreq_superfluous = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the @@ -116,8 +117,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) /* Work through the sublist. */ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) + if (!len) { + subreq_superfluous = true; break; + } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; @@ -154,19 +157,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); - if (subreq == to) + if (subreq == to) { + subreq_superfluous = false; break; + } } /* If we managed to use fewer subreqs, we can discard the * excess; if we used the same number, then we're done. 
*/ if (!len) { - if (subreq == to) + if (!subreq_superfluous) continue; list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); if (subreq == to) @@ -187,14 +192,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = len; - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); subreq->stream_nr = stream->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); @@ -256,14 +259,34 @@ void netfs_retry_reads(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + netfs_stat(&netfs_n_rh_retry_read_req); + + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); } + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index f1af344266cc..ab6b916addc4 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_rh_retry_read_req; +atomic_t netfs_n_rh_retry_read_subreq; atomic_t netfs_n_wh_buffered_write; atomic_t netfs_n_wh_writethrough; atomic_t netfs_n_wh_dio_write; @@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed; atomic_t netfs_n_wh_write; atomic_t netfs_n_wh_write_done; atomic_t netfs_n_wh_write_failed; +atomic_t netfs_n_wh_retry_write_req; +atomic_t netfs_n_wh_retry_write_subreq; atomic_t netfs_n_wb_lock_skip; atomic_t netfs_n_wb_lock_wait; atomic_t netfs_n_folioq; @@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n", + atomic_read(&netfs_n_rh_retry_read_req), + atomic_read(&netfs_n_rh_retry_read_subreq), + atomic_read(&netfs_n_wh_retry_write_req), + atomic_read(&netfs_n_wh_retry_write_subreq)); seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), atomic_read(&netfs_n_rh_sreq), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 69727411683e..77279fc5b5a7 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -253,6 +253,7 @@ void 
netfs_reissue_write(struct netfs_io_stream *stream, subreq->retry_count++; __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); netfs_do_issue_write(stream, subreq); } diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index c841a851dd73..545d33079a77 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -203,6 +203,8 @@ void netfs_retry_writes(struct netfs_io_request *wreq) struct netfs_io_stream *stream; int s; + netfs_stat(&netfs_n_wh_retry_write_req); + /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 0e552d873eaa..fb9b1656a287 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -446,11 +446,20 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) struct nfsd_file, nf_gc); struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); struct nfsd_fcache_disposal *l = nn->fcache_disposal; + struct svc_serv *serv; spin_lock(&l->lock); list_move_tail(&nf->nf_gc, &l->freeme); spin_unlock(&l->lock); - svc_wake_up(nn->nfsd_serv); + + /* + * The filecache laundrette is shut down after the + * nn->nfsd_serv pointer is cleared, but before the + * svc_serv is freed. + */ + serv = nn->nfsd_serv; + if (serv) + svc_wake_up(serv); } } diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 4e3be7201b1c..5fb202acb0fd 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -84,6 +84,8 @@ out: fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 5e34e98db969..7b5433bd3019 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -76,6 +76,8 @@ out: fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 50e468bdb8d4..484077200c5d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, return status; status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); - if (unlikely(status || cb->cb_seq_status)) + if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) return -NFSERR_BAD_XDR; @@ -1583,8 +1583,11 @@ nfsd4_run_cb_work(struct work_struct *work) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; - if (!clnt) { - /* Callback channel broken, or client killed; give up: */ + if (!clnt || clp->cl_state == NFSD4_COURTESY) { + /* + * Callback channel broken, client killed or + * nfs4_client in courtesy state; give up. 
+ */ nfsd41_destroy_cb(cb); return; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b7a0cfd05401..153eeea2c7c9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4459,10 +4459,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } } while (slot && --cnt > 0); } + +out: seq->maxslots = max(session->se_target_maxslots, seq->maxslots); seq->target_maxslots = session->se_target_maxslots; -out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 32019751a41e..aef474f1b84b 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -380,8 +380,9 @@ __fh_verify(struct svc_rqst *rqstp, error = check_nfsd_access(exp, rqstp, may_bypass_gss); if (error) goto out; - - svc_xprt_set_valid(rqstp->rq_xprt); + /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ + if (rqstp) + svc_xprt_set_valid(rqstp->rq_xprt); /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a00120a3c099..10d01eb09c43 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1524,7 +1524,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) pr_warn_once("Unexpected adding of device dump\n"); if (vmcore_open) { ret = -EBUSY; - goto out_err; + goto unlock; } list_add_tail(&dump->list, &vmcoredd_list); @@ -1532,6 +1532,9 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) mutex_unlock(&vmcore_mutex); return 0; +unlock: + mutex_unlock(&vmcore_mutex); + out_err: vfree(buf); vfree(dump); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index ac1f890a0d54..cddeb2adbf4a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -253,6 +253,7 @@ struct cifs_cred { struct cifs_open_info_data { bool adjust_tz; bool reparse_point; + bool contains_posix_file_info; struct { /* ioctl response buffer */ struct { @@ -1508,7 +1509,6 @@ struct cifs_io_parms { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server; pid_t pid; }; @@ -2325,8 +2325,8 @@ struct smb2_compound_vars { struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; + struct smb2_file_rename_info_hdr rename_info; + struct smb2_file_link_info_hdr link_info; struct kvec ea_iov; }; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 79de2f2f9c41..8582cf61242c 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -147,7 +147,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t size; int rc = 0; @@ -156,6 +156,8 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) rdata->xid = get_xid(); rdata->have_xid = true; } + + server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -198,7 +200,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest 
*rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server = rdata->server; int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", @@ -266,7 +268,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); - req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 9cc31cf6ebd0..616149c7f0a5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1215,6 +1215,19 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, rc = server->ops->parse_reparse_point(cifs_sb, full_path, iov, data); + /* + * If the reparse point was not handled but it is the + * name surrogate which points to directory, then treat + * is as a new mount point. Name surrogate reparse point + * represents another named entity in the system. + */ + if (rc == -EOPNOTSUPP && + IS_REPARSE_TAG_NAME_SURROGATE(data->reparse.tag) && + (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY)) { + rc = 0; + cifs_create_junction_fattr(fattr, sb); + goto out; + } } if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { @@ -1408,7 +1421,7 @@ int cifs_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } @@ -1507,7 +1520,7 @@ int smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 0a5a52a8a7dd..2b9e9885dc42 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -1088,13 +1088,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf, le32_to_cpu(buf->ReparseTag)); return -EIO; } - break; + return 0; default: cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", le32_to_cpu(buf->ReparseTag)); - break; + return -EOPNOTSUPP; } - return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 5a753fec7e2c..c0be5ab45a78 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -99,14 +99,30 @@ static inline bool reparse_inode_match(struct inode *inode, static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) { - struct smb2_file_all_info *fi = &data->fi; - u32 attrs = le32_to_cpu(fi->Attributes); + u32 attrs; bool ret; - ret = data->reparse_point || (attrs & ATTR_REPARSE); - if (ret) - attrs |= ATTR_REPARSE; - fi->Attributes = cpu_to_le32(attrs); + if (data->contains_posix_file_info) { + struct smb311_posix_qinfo *fi = &data->posix_fi; + + attrs = le32_to_cpu(fi->DosAttributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + fi->DosAttributes = cpu_to_le32(attrs); + } + + } else { + struct smb2_file_all_info *fi = &data->fi; + + attrs = le32_to_cpu(fi->Attributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + 
fi->Attributes = cpu_to_le32(attrs); + } + } + + ret = attrs & ATTR_REPARSE; + return ret; } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5dfb30b0a852..826b57a5a2a8 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -650,6 +650,7 @@ finished: switch (cmds[i]) { case SMB2_OP_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -673,6 +674,7 @@ finished: break; case SMB2_OP_POSIX_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = true; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -770,6 +772,7 @@ finished: idata = in_iov[i].iov_base; idata->reparse.io.iov = *iov; idata->reparse.io.buftype = resp_buftype[i + 1]; + idata->contains_posix_file_info = false; /* BB VERIFY */ rbuf = reparse_buf_ptr(iov); if (IS_ERR(rbuf)) { rc = PTR_ERR(rbuf); @@ -791,6 +794,7 @@ finished: case SMB2_OP_QUERY_WSL_EA: if (!rc) { idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; qi_rsp = rsp_iov[i + 1].iov_base; data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset); size[0] = le32_to_cpu(qi_rsp->OutputBufferLength); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ec36bed54b0b..4dd11eafb69d 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1001,6 +1001,7 @@ static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, if (!data->symlink_target) return -ENOMEM; } + data->contains_posix_file_info = false; return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } @@ -4964,6 +4965,10 @@ one_more: next_buffer = (char *)cifs_buf_get(); else next_buffer = (char *)cifs_small_buf_get(); + if (!next_buffer) { + cifs_server_dbg(VFS, "No memory for (large) SMB response\n"); + return -1; + } memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd); } @@ -5146,7 +5151,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, FILE_CREATE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, ACL_NO_MODE); oparms.fid = &fid; - + idata.contains_posix_file_info = false; rc = server->ops->open(xid, &oparms, &oplock, &idata); if (rc) goto out; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 3336df2ea5d4..c7a0efda4403 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1707,23 +1707,33 @@ struct smb2_file_internal_info { } __packed; /* level 6 Query */ struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_rename_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) 
*/ + __le32 FileNameLength; + ); char FileName[]; /* New name to be assigned */ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ } __packed; /* level 10 Set */ +static_assert(offsetof(struct smb2_file_rename_info, FileName) == sizeof(struct smb2_file_rename_info_hdr), + "struct member likely outside of __struct_group()"); struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_link_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + ); char FileName[]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ +static_assert(offsetof(struct smb2_file_link_info, FileName) == sizeof(struct smb2_file_link_info_hdr), + "struct member likely outside of __struct_group()"); /* * This level 18, although with struct with same name is different from cifs diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h index 4b379e84c46b..3253a18ecb5c 100644 --- a/fs/smb/common/smbfsctl.h +++ b/fs/smb/common/smbfsctl.h @@ -159,6 +159,9 @@ #define IO_REPARSE_TAG_LX_CHR 0x80000025 #define IO_REPARSE_TAG_LX_BLK 0x80000026 +/* If Name Surrogate Bit is set, the file or directory represents another named entity in the system. */ +#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000)) + /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index bdcd40f0ec74..19877d99f255 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -224,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) bool xchk_dir_looks_zapped(struct xfs_inode *dp); bool xchk_pptr_looks_zapped(struct xfs_inode *ip); -#ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. 
*/ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -244,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc) return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED); } -#else -# define xchk_needs_repair(sc) (false) -# define xchk_could_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 2f641b6d663e..13ff1c933cb8 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1055,9 +1055,17 @@ xrep_dinode_check_dfork( return true; break; case S_IFREG: - if (fmt == XFS_DINODE_FMT_LOCAL) + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: return true; - fallthrough; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: + break; + default: + return true; + } + break; case S_IFLNK: case S_IFDIR: switch (fmt) { diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 823c00d1a502..af0a3a9e5ed9 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -191,7 +191,16 @@ int xrep_reset_metafile_resv(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) -#define xrep_will_attempt(sc) (false) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. + */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} static inline int xrep_attempt( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7567dd5cad14..6fa9e3e5bab7 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. + */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 67877c36ed11..6d9965b546cb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -19,6 +19,7 @@ #include "xfs_reflink.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_icache.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -528,12 +529,44 @@ xfs_vm_readahead( } static int -xfs_iomap_swapfile_activate( +xfs_vm_swap_activate( struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { - sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). 
Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. + */ + xfs_inodegc_flush(ip->i_mount); + + /* + * Direct the swap code to the correct block device when this file + * sits on the RT device. + */ + sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, &xfs_read_iomap_ops); } @@ -549,11 +582,11 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 37f1230e7584..245d754f382a 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -78,6 +78,28 @@ xfs_qm_statvfs( } } +STATIC int +xfs_qm_validate_state_change( + struct xfs_mount *mp, + uint uqd, + uint gqd, + uint pqd) +{ + int state; + + /* Is quota state changing? */ + state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) || + (!uqd && XFS_IS_UQUOTA_ON(mp)) || + (gqd && !XFS_IS_GQUOTA_ON(mp)) || + (!gqd && XFS_IS_GQUOTA_ON(mp)) || + (pqd && !XFS_IS_PQUOTA_ON(mp)) || + (!pqd && XFS_IS_PQUOTA_ON(mp))); + + return state && + (xfs_dev_is_read_only(mp, "changing quota state") || + xfs_has_norecovery(mp)); +} + int xfs_qm_newmount( xfs_mount_t *mp, @@ -97,24 +119,25 @@ xfs_qm_newmount( } /* - * If the device itself is read-only, we can't allow - * the user to change the state of quota on the mount - - * this would generate a transaction on the ro device, - * which would lead to an I/O error and shutdown + * If the device itself is read-only and/or in norecovery + * mode, we can't allow the user to change the state of + * quota on the mount - this would generate a transaction + * on the ro device, which would lead to an I/O error and + * shutdown. */ - if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || - (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && - xfs_dev_is_read_only(mp, "changing quota state")) { - xfs_warn(mp, "please mount with%s%s%s%s.", - (!quotaondisk ? "out quota" : ""), - (uquotaondisk ? " usrquota" : ""), - (gquotaondisk ? " grpquota" : ""), - (pquotaondisk ? 
" prjquota" : "")); + if (xfs_qm_validate_state_change(mp, uquotaondisk, + gquotaondisk, pquotaondisk)) { + + if (xfs_has_metadir(mp)) + xfs_warn(mp, + "metadir enabled, please mount without any quota mount options"); + else + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); return -EPERM; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d92d7a07ea89..0055066fb1d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1661,8 +1661,12 @@ xfs_fs_fill_super( #endif } - /* Filesystem claims it needs repair, so refuse the mount. */ - if (xfs_has_needsrepair(mp)) { + /* + * Filesystem claims it needs repair, so refuse the mount unless + * norecovery is also specified, in which case the filesystem can + * be mounted with no risk of further damage. + */ + if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; |