summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2024-02-23 02:24:56 +0300
committerJakub Kicinski <kuba@kernel.org>2024-02-23 02:29:26 +0300
commitfecc51559a844b7f74119159c3cdb25b80b4e2c6 (patch)
tree2cf1e49810eb0c58e552f722cc2ab2742e62cf43 /fs
parent0fb848d1a41e0d3895cb157810862db6046063dd (diff)
parent6714ebb922ab15a209dfc3c1ed29d4bb0abc9f02 (diff)
downloadlinux-fecc51559a844b7f74119159c3cdb25b80b4e2c6.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR. Conflicts: net/ipv4/udp.c f796feabb9f5 ("udp: add local "peek offset enabled" flag") 56667da7399e ("net: implement lockless setsockopt(SO_PEEK_OFF)") Adjacent changes: net/unix/garbage.c aa82ac51d633 ("af_unix: Drop oob_skb ref before purging queue in GC.") 11498715f266 ("af_unix: Remove io_uring code for GC.") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/bcachefs.h12
-rw-r--r--fs/bcachefs/btree_update_interior.c3
-rw-r--r--fs/bcachefs/fs.c9
-rw-r--r--fs/bcachefs/io_write.c1
-rw-r--r--fs/bcachefs/journal_io.c4
-rw-r--r--fs/bcachefs/journal_reclaim.c2
-rw-r--r--fs/bcachefs/printbuf.c1
-rw-r--r--fs/bcachefs/recovery.c11
-rw-r--r--fs/bcachefs/sb-members.c2
-rw-r--r--fs/bcachefs/super-io.c2
-rw-r--r--fs/bcachefs/super.c4
-rw-r--r--fs/btrfs/defrag.c2
-rw-r--r--fs/btrfs/extent_io.c62
-rw-r--r--fs/ceph/caps.c65
-rw-r--r--fs/ceph/mds_client.c48
-rw-r--r--fs/ceph/mds_client.h5
-rw-r--r--fs/smb/client/cached_dir.c1
-rw-r--r--fs/smb/client/cifsglob.h1
-rw-r--r--fs/smb/client/connect.c14
-rw-r--r--fs/smb/client/fs_context.c11
-rw-r--r--fs/smb/client/namespace.c16
-rw-r--r--fs/smb/client/smb2ops.c14
-rw-r--r--fs/smb/client/smb2pdu.c10
-rw-r--r--fs/zonefs/file.c42
-rw-r--r--fs/zonefs/super.c66
25 files changed, 299 insertions, 109 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b80c6c9efd8c..69d0d60d50e3 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1249,6 +1249,18 @@ static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
return stdio;
}
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.metadata_replicas,
+ c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.data_replicas,
+ c->opts.data_replicas_required);
+}
+
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 17a5938aa71a..4530b14ff2c3 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -280,7 +280,8 @@ retry:
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
- c->opts.metadata_replicas_required,
+ min(res->nr_replicas,
+ c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
return ERR_PTR(ret);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ec419b8e2c43..77ae65542db9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
bch2_subvol_is_ro(c, inode->ei_subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
- return ret;
+ return bch2_err_class(ret);
ihold(&inode->v);
d_instantiate(dentry, &inode->v);
@@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
__bch2_unlink(vdir, dentry, false);
+ return bch2_err_class(ret);
}
static int bch2_symlink(struct mnt_idmap *idmap,
@@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
return 0;
err:
iput(&inode->v);
- return ret;
+ return bch2_err_class(ret);
}
static int bch2_mkdir(struct mnt_idmap *idmap,
@@ -641,7 +642,7 @@ err:
src_inode,
dst_inode);
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_setattr_copy(struct mnt_idmap *idmap,
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index ef3a53f9045a..2c098ac017b3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1564,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index bfd6585e746d..47805193f18c 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1478,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_need = min_t(unsigned, replicas_want,
+ READ_ONCE(c->opts.metadata_replicas_required));
rcu_read_lock();
retry:
@@ -1526,7 +1528,7 @@ done:
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+ return replicas >= replicas_need ? 0 : -EROFS;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 820d25e19e5f..2cf626315652 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -205,7 +205,7 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
- if (nr_online < c->opts.metadata_replicas_required) {
+ if (nr_online < metadata_replicas_required(c)) {
ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index accf246c3233..b27d22925929 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_copy(args2, args);
len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ va_end(args2);
} while (len + 1 >= printbuf_remaining(out) &&
!bch2_printbuf_make_room(out, len + 1));
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 9127d0e3ca2f..21e13bb4335b 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -577,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v)
static bool check_version_upgrade(struct bch_fs *c)
{
- unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned latest_compatible = min(latest_version,
+ bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
@@ -597,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c)
new_version = latest_version;
break;
case BCH_VERSION_UPGRADE_none:
- new_version = old_version;
+ new_version = min(old_version, latest_version);
break;
}
}
@@ -774,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!(c->opts.nochanges && c->opts.norecovery)) {
+ if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
bool write_sb = false;
@@ -804,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (bch2_check_version_downgrade(c)) {
struct printbuf buf = PRINTBUF;
- prt_str(&buf, "Version downgrade required:\n");
+ prt_str(&buf, "Version downgrade required:");
__le64 passes = ext->recovery_passes_required[0];
bch2_sb_set_downgrade(c,
@@ -812,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c)
BCH_VERSION_MINOR(c->sb.version));
passes = ext->recovery_passes_required[0] & ~passes;
if (passes) {
- prt_str(&buf, " running recovery passes: ");
+ prt_str(&buf, "\n running recovery passes: ");
prt_bitflags(&buf, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index a45354d2acde..eff5ce18c69c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -421,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = ktime_get_real_seconds();
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d60c7d27a047..36988add581f 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -717,7 +717,7 @@ retry:
if (IS_ERR(sb->bdev_handle)) {
ret = PTR_ERR(sb->bdev_handle);
- goto out;
+ goto err;
}
sb->bdev = sb->bdev_handle->bdev;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index b9911402b175..6b23e11825e6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1428,10 +1428,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
- : c->opts.metadata_replicas_required,
+ : metadata_replicas_required(c),
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
- : c->opts.data_replicas_required);
+ : data_replicas_required(c));
return nr_rw >= required;
case BCH_MEMBER_STATE_failed:
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index c276b136ab63..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfd2967f04a2..3e19a2475ab3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2689,16 +2689,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
* it beyond i_size.
*/
while (cur_offset < end && cur_offset < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
+ u64 lockstart;
+ u64 lockend;
u64 prealloc_len = 0;
bool delalloc;
+ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+ lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
break;
@@ -2866,15 +2884,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
@@ -2885,12 +2903,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
@@ -2898,7 +2915,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
@@ -2910,7 +2927,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2933,19 +2950,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
@@ -2955,7 +2972,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -3049,29 +3066,41 @@ check_eof_delalloc:
btrfs_free_path(path);
path = NULL;
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
goto out_unlock;
- prev_extent_end = lockend;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
const u64 i_size = i_size_read(&inode->vfs_inode);
if (prev_extent_end < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
+ u64 lockstart;
+ u64 lockend;
bool delalloc;
+ lockstart = round_down(prev_extent_end, sectorsize);
+ lockend = round_up(i_size, sectorsize);
+
+ /*
+ * See the comment in fiemap_process_hole as to why
+ * we're doing the locking here.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
&delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
cache.flags |= FIEMAP_EXTENT_LAST;
} else {
@@ -3082,7 +3111,6 @@ check_eof_delalloc:
ret = emit_last_fiemap_cache(fieinfo, &cache);
out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
out:
free_extent_state(delalloc_cached_state);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ad1f46c66fbf..7fb4aae97412 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2156,6 +2156,30 @@ retry:
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
+ /* completed revocation? going down and there are no caps? */
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ doutc(cl, "completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
+ }
+
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
@@ -2183,30 +2207,6 @@ retry:
}
}
- /* completed revocation? going down and there are no caps? */
- if (revoking) {
- if ((revoking & cap_used) == 0) {
- doutc(cl, "completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
- }
-
- /*
- * If the "i_wrbuffer_ref" was increased by mmap or generic
- * cache write just before the ceph_check_caps() is called,
- * the Fb capability revoking will fail this time. Then we
- * must wait for the BDI's delayed work to flush the dirty
- * pages and to release the "i_wrbuffer_ref", which will cost
- * at most 5 seconds. That means the MDS needs to wait at
- * most 5 seconds to finished the Fb capability's revocation.
- *
- * Let's queue a writeback for it.
- */
- if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
- (revoking & CEPH_CAP_FILE_BUFFER))
- queue_writeback = true;
- }
-
/* want more caps from mds? */
if (want & ~cap->mds_wanted) {
if (want & ~(cap->mds_wanted | cap->issued))
@@ -4772,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_fs_client(inode)->mdsc;
- __cap_delay_requeue_front(mdsc, ci);
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add_tail(&ci->i_cap_delay_list,
+ &mdsc->cap_unlink_delay_list);
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+
+ /*
+ * Fire the work immediately, because the MDS maybe
+ * waiting for caps release.
+ */
+ ceph_queue_cap_unlink_work(mdsc);
}
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f71bb9c9569f..3ab9c268a8bb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2484,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
}
}
+void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {
+ doutc(cl, "caps unlink work queued\n");
+ } else {
+ doutc(cl, "failed to queue caps unlink work\n");
+ }
+}
+
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_unlink_work);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+
+ ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ list_del_init(&ci->i_cap_delay_list);
+
+ inode = igrab(&ci->netfs.inode);
+ if (inode) {
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+ iput(inode);
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ }
+ }
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+ doutc(cl, "done\n");
+}
+
/*
* requests
*/
@@ -5359,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->cap_delay_list);
INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
+ spin_lock_init(&mdsc->cap_unlink_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
@@ -5367,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
err = ceph_metric_init(&mdsc->metric);
if (err)
goto err_mdsmap;
@@ -5640,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
+ cancel_work_sync(&mdsc->cap_unlink_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
doutc(cl, "done\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 40560af38827..03f8ff00874f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -462,6 +462,8 @@ struct ceph_mds_client {
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
+ spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
@@ -475,6 +477,8 @@ struct ceph_mds_client {
struct work_struct cap_reclaim_work;
atomic_t cap_reclaim_pending;
+ struct work_struct cap_unlink_work;
+
/*
* Cap reservations
*
@@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg);
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 1daeb5714faa..3de5047a7ff9 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -242,6 +242,7 @@ replay_again:
.desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES,
.disposition = FILE_OPEN,
.fid = pfid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index c86a72c9d9ec..53c75cfb33ab 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1378,6 +1378,7 @@ struct cifs_open_parms {
struct cifs_fid *fid;
umode_t mode;
bool reconnect:1;
+ bool replay:1; /* indicates that this open is for a replay */
};
struct cifs_fid {
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index d03253f8f145..ac9595504f4b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -3444,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
* the user on mount
*/
if ((cifs_sb->ctx->wsize == 0) ||
- (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx)))
- cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx);
+ (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) {
+ cifs_sb->ctx->wsize =
+ round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE);
+ /*
+ * in the very unlikely event that the server sent a max write size under PAGE_SIZE,
+ * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096
+ */
+ if (cifs_sb->ctx->wsize == 0) {
+ cifs_sb->ctx->wsize = PAGE_SIZE;
+ cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n");
+ }
+ }
if ((cifs_sb->ctx->rsize == 0) ||
(cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index aec8dbd1f9db..4b2f5aa2ea0e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1111,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_wsize:
ctx->wsize = result.uint_32;
ctx->got_wsize = true;
+ if (ctx->wsize % PAGE_SIZE != 0) {
+ ctx->wsize = round_down(ctx->wsize, PAGE_SIZE);
+ if (ctx->wsize == 0) {
+ ctx->wsize = PAGE_SIZE;
+ cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE);
+ } else {
+ cifs_dbg(VFS,
+ "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n",
+ ctx->wsize, PAGE_SIZE);
+ }
+ }
break;
case Opt_acregmax:
ctx->acregmax = HZ * result.uint_32;
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index a6968573b775..4a517b280f2b 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
return s;
}
+static void fs_context_set_ids(struct smb3_fs_context *ctx)
+{
+ kuid_t uid = current_fsuid();
+ kgid_t gid = current_fsgid();
+
+ if (ctx->multiuser) {
+ if (!ctx->uid_specified)
+ ctx->linux_uid = uid;
+ if (!ctx->gid_specified)
+ ctx->linux_gid = gid;
+ }
+ if (!ctx->cruid_specified)
+ ctx->cred_uid = uid;
+}
+
/*
* Create a vfsmount that we can automount
*/
@@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
tmp.leaf_fullpath = NULL;
tmp.UNC = tmp.prepath = NULL;
tmp.dfs_root_ses = NULL;
+ fs_context_set_ids(&tmp);
rc = smb3_fs_context_dup(ctx, &tmp);
if (rc) {
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 83c898afc835..4695433fcf39 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -619,7 +619,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
goto out;
}
- while (bytes_left >= sizeof(*p)) {
+ while (bytes_left >= (ssize_t)sizeof(*p)) {
memset(&tmp_iface, 0, sizeof(tmp_iface));
tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
@@ -1204,6 +1204,7 @@ replay_again:
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = &fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -1569,6 +1570,7 @@ replay_again:
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, create_options),
.fid = &fid,
+ .replay = !!(retries),
};
if (qi.flags & PASSTHRU_FSCTL) {
@@ -2295,6 +2297,7 @@ replay_again:
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -2681,6 +2684,7 @@ replay_again:
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = &fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -5213,7 +5217,7 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
struct inode *new;
struct kvec iov;
__le16 *path;
- char *sym;
+ char *sym, sep = CIFS_DIR_SEP(cifs_sb);
u16 len, plen;
int rc = 0;
@@ -5227,7 +5231,8 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
.symlink_target = sym,
};
- path = cifs_convert_path_to_utf16(symname, cifs_sb);
+ convert_delimiter(sym, sep);
+ path = cifs_convert_path_to_utf16(sym, cifs_sb);
if (!path) {
rc = -ENOMEM;
goto out;
@@ -5250,7 +5255,10 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
buf->PrintNameLength = cpu_to_le16(plen);
memcpy(buf->PathBuffer, path, plen);
buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+ if (*sym != sep)
+ buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
+ convert_delimiter(sym, '/');
iov.iov_base = buf;
iov.iov_len = len;
new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4085ce27fd38..608ee05491e2 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -2404,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
*/
buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
- generate_random_uuid(buf->dcontext.CreateGuid);
- memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* for replay, we should not overwrite the existing create guid */
+ if (!oparms->replay) {
+ generate_random_uuid(buf->dcontext.CreateGuid);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+ } else
+ memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16);
/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
buf->Name[0] = 'D';
@@ -3142,6 +3147,7 @@ replay_again:
/* reinitialize for possible replay */
flags = 0;
server = cifs_pick_channel(ses);
+ oparms->replay = !!(retries);
cifs_dbg(FYI, "create/open\n");
if (!ses || !server)
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..dba5dcb62bef 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
struct zonefs_inode_info *zi = ZONEFS_I(inode);
if (error) {
- zonefs_io_error(inode, true);
+ /*
+ * For Sync IOs, error recovery is called from
+ * zonefs_file_dio_write().
+ */
+ if (!is_sync_kiocb(iocb))
+ zonefs_io_error(inode, true);
return error;
}
@@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EINVAL;
goto inode_unlock;
}
+ /*
+ * Advance the zone write pointer offset. This assumes that the
+ * IO will succeed, which is OK to do because we do not allow
+ * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+ * fails, the error path will correct the write pointer offset.
+ */
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = -EBUSY;
- if (zonefs_zone_is_seq(z) &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- z->z_wpoffset += count;
- zonefs_inode_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
+ /*
+ * For a failed IO or partial completion, trigger error recovery
+ * to update the zone write pointer offset to a correct value.
+ * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+ * have executed error recovery if the IO already completed when we
+ * reach here. However, we cannot know that and execute error recovery
+ * again (that will not change anything).
+ */
+ if (zonefs_zone_is_seq(z)) {
+ if (ret > 0 && ret != count)
+ ret = -EIO;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ zonefs_io_error(inode, true);
}
inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..b6e8e7c96251 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
z->z_mode = inode->i_mode;
}
-struct zonefs_ioerr_data {
- struct inode *inode;
- bool write;
-};
-
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
- struct zonefs_ioerr_data *err = data;
- struct inode *inode = err->inode;
+ struct blk_zone *z = data;
+
+ *z = *zone;
+ return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+ bool write)
+{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
- !err->write && isize == data_size)
- return 0;
+ !write && isize == data_size)
+ return;
/*
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zonefs_zone_is_seq(z) && isize != data_size)
+ if (isize != data_size)
zonefs_warn(sb,
"inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_i_size_write(inode, data_size);
z->z_wpoffset = data_size;
zonefs_inode_account_active(inode);
-
- return 0;
}
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones = 1;
- struct zonefs_ioerr_data err = {
- .inode = inode,
- .write = write,
- };
+ struct blk_zone zone;
int ret;
/*
- * The only files that have more than one zone are conventional zone
- * files with aggregated conventional zones, for which the inode zone
- * size is always larger than the device zone size.
+ * Conventional zone have no write pointer and cannot become read-only
+ * or offline. So simply fake a report for a single or aggregated zone
+ * and let zonefs_handle_io_error() correct the zone inode information
+ * according to the mount options.
*/
- if (z->z_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = z->z_size >>
- (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ if (!zonefs_zone_is_seq(z)) {
+ zone.start = z->z_sector;
+ zone.len = z->z_size >> SECTOR_SHIFT;
+ zone.wp = zone.start + zone.len;
+ zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone.cond = BLK_ZONE_COND_NOT_WP;
+ zone.capacity = zone.len;
+ goto handle_io_error;
+ }
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
- zonefs_io_error_cb, &err);
- if (ret != nr_zones)
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+ zonefs_io_error_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ if (ret != 1) {
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
- memalloc_noio_restore(noio_flag);
+ zonefs_warn(sb, "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ return;
+ }
+
+handle_io_error:
+ zonefs_handle_io_error(inode, &zone, write);
}
static struct kmem_cache *zonefs_inode_cachep;