Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/block-group.c           14
-rw-r--r--  fs/btrfs/block-group.h            5
-rw-r--r--  fs/btrfs/inode.c                 77
-rw-r--r--  fs/btrfs/qgroup.c                 1
-rw-r--r--  fs/btrfs/raid56.c                11
-rw-r--r--  fs/btrfs/volumes.c               17
-rw-r--r--  fs/erofs/decompressor.c          37
-rw-r--r--  fs/erofs/inode.c                  3
-rw-r--r--  fs/erofs/zdata.c                  4
-rw-r--r--  fs/ext4/mballoc.c               172
-rw-r--r--  fs/ext4/xattr.c                  14
-rw-r--r--  fs/fuse/dir.c                     4
-rw-r--r--  fs/fuse/inode.c                   8
-rw-r--r--  fs/fuse/ioctl.c                  21
-rw-r--r--  fs/iomap/buffered-io.c            4
-rw-r--r--  fs/jbd2/checkpoint.c            277
-rw-r--r--  fs/jbd2/commit.c                  3
-rw-r--r--  fs/jbd2/transaction.c            40
-rw-r--r--  fs/smb/client/cifsfs.h            4
-rw-r--r--  fs/smb/client/cifsglob.h          2
-rw-r--r--  fs/smb/client/cifssmb.c           2
-rw-r--r--  fs/smb/client/connect.c          30
-rw-r--r--  fs/smb/client/dfs.c              26
-rw-r--r--  fs/smb/client/file.c              4
-rw-r--r--  fs/smb/client/ioctl.c            17
-rw-r--r--  fs/smb/client/smb2ops.c           8
-rw-r--r--  fs/smb/client/smb2transport.c     2
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h    75
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h            4
-rw-r--r--  fs/xfs/xfs_ondisk.h               5
30 files changed, 508 insertions, 383 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 48ae509f2ac2..23726152d62d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1640,13 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
- trace_btrfs_add_unused_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
+ trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
- } else {
+ } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
/* Pull out the block group from the reclaim_bgs list. */
+ trace_btrfs_add_unused_block_group(bg);
list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
@@ -2087,6 +2088,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
/* Shouldn't have super stripes in sequential zones */
if (zoned && nr) {
+ kfree(logical);
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
cache->start);
@@ -2668,6 +2670,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
list_del_init(&block_group->bg_list);
+ clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
}
btrfs_trans_release_chunk_metadata(trans);
}
@@ -2707,6 +2710,13 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
if (!cache)
return ERR_PTR(-ENOMEM);
+ /*
+ * Mark it as new before adding it to the rbtree of block groups or any
+ * list, so that no other task finds it and calls btrfs_mark_bg_unused()
+ * before the new flag is set.
+ */
+ set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
+
cache->length = size;
set_free_space_tree_thresholds(cache);
cache->flags = type;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index f204addc3fe8..381c54a56417 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -70,6 +70,11 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
+ /*
+ * Indicate that block group is in the list of new block groups of a
+ * transaction.
+ */
+ BLOCK_GROUP_FLAG_NEW,
};
enum btrfs_caching_type {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dbbb67293e34..49cef61f6a39 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3482,15 +3482,21 @@ zeroit:
void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned long flags;
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
atomic_inc(&fs_info->nr_delayed_iputs);
- spin_lock(&fs_info->delayed_iput_lock);
+ /*
+ * Need to be irq safe here because we can be called from either an irq
+ * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
+ * context.
+ */
+ spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
ASSERT(list_empty(&inode->delayed_iput));
list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
}
@@ -3499,37 +3505,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
list_del_init(&inode->delayed_iput);
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
wake_up(&fs_info->delayed_iputs_wait);
- spin_lock(&fs_info->delayed_iput_lock);
+ spin_lock_irq(&fs_info->delayed_iput_lock);
}
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
if (!list_empty(&inode->delayed_iput)) {
- spin_lock(&fs_info->delayed_iput_lock);
+ spin_lock_irq(&fs_info->delayed_iput_lock);
if (!list_empty(&inode->delayed_iput))
run_delayed_iput_locked(fs_info, inode);
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
}
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
-
- spin_lock(&fs_info->delayed_iput_lock);
+ /*
+ * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
+ * calls btrfs_add_delayed_iput() and that needs to lock
+ * fs_info->delayed_iput_lock. So we need to disable irqs here to
+ * prevent a deadlock.
+ */
+ spin_lock_irq(&fs_info->delayed_iput_lock);
while (!list_empty(&fs_info->delayed_iputs)) {
struct btrfs_inode *inode;
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
- cond_resched_lock(&fs_info->delayed_iput_lock);
+ if (need_resched()) {
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
+ cond_resched();
+ spin_lock_irq(&fs_info->delayed_iput_lock);
+ }
}
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
}
/*
@@ -3659,11 +3674,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(fs_info->sb, last_objectid, root);
- ret = PTR_ERR_OR_ZERO(inode);
- if (ret && ret != -ENOENT)
- goto out;
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ if (ret != -ENOENT)
+ goto out;
+ }
- if (ret == -ENOENT && root == fs_info->tree_root) {
+ if (!inode && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
int is_dead_root = 0;
@@ -3724,17 +3742,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (ret == -ENOENT || inode->i_nlink) {
- if (!ret) {
+ if (!inode || inode->i_nlink) {
+ if (inode) {
ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ inode = NULL;
if (ret)
goto out;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- iput(inode);
goto out;
}
btrfs_debug(fs_info, "auto deleting %Lu",
@@ -3742,10 +3760,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
btrfs_end_transaction(trans);
- if (ret) {
- iput(inode);
+ if (ret)
goto out;
- }
continue;
}
@@ -4847,9 +4863,6 @@ again:
ret = -ENOMEM;
goto out;
}
- ret = set_page_extent_mapped(page);
- if (ret < 0)
- goto out_unlock;
if (!PageUptodate(page)) {
ret = btrfs_read_folio(NULL, page_folio(page));
@@ -4864,6 +4877,17 @@ again:
goto out_unlock;
}
}
+
+ /*
+ * We unlock the page after the io is completed and then re-lock it
+ * above. release_folio() could have come in between that and cleared
+ * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * here to make sure it's properly set for the subpage stuff.
+ */
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
wait_on_page_writeback(page);
lock_extent(io_tree, block_start, block_end, &cached_state);
@@ -7849,8 +7873,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
if (ret) {
- bbio->bio.bi_status = errno_to_blk_status(ret);
- btrfs_dio_end_io(bbio);
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ file_offset, dip->bytes,
+ !ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ iomap_dio_bio_end_io(bio);
return;
}
}
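
The delayed-iput hunks above switch fs_info->delayed_iput_lock to the irq-safe spinlock variants because btrfs_add_delayed_iput() can now be reached from irq context. The following is an illustrative kernel-style sketch of that general pattern, with invented names (example_lock, example_add, example_run) that are not part of the patch: the producer, which may run with or without interrupts already disabled, uses spin_lock_irqsave(); the consumer, which only runs in process context, uses spin_lock_irq() and drops the lock around anything that may sleep.

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static LIST_HEAD(example_items);

/* May be called from irq or process context, so save/restore irq state. */
static void example_add(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	list_add_tail(entry, &example_items);
	spin_unlock_irqrestore(&example_lock, flags);
}

/* Runs in process context only, so the plain _irq variants are enough. */
static void example_run(void)
{
	spin_lock_irq(&example_lock);
	while (!list_empty(&example_items)) {
		struct list_head *entry = example_items.next;

		list_del_init(entry);
		spin_unlock_irq(&example_lock);
		/* ... sleepable work on the entry, e.g. iput() ... */
		cond_resched();
		spin_lock_irq(&example_lock);
	}
	spin_unlock_irq(&example_lock);
}
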
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index da1f84a0eb29..2637d6b157ff 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -4445,4 +4445,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
ulist_free(entry->old_roots);
kfree(entry);
}
+ *root = RB_ROOT;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f37b925d587f..0249ea52bb80 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -71,7 +71,7 @@ static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
-static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);
static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
@@ -2404,7 +2404,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
return 0;
}
-static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
@@ -2445,9 +2445,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- if (!need_check)
- goto writeback;
-
p_sector.page = alloc_page(GFP_NOFS);
if (!p_sector.page)
return -ENOMEM;
@@ -2516,7 +2513,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
q_sector.page = NULL;
}
-writeback:
/*
* time to start writing. Make bios for everything from the
* higher layers (the bio_list in our rbio) and our p/q. Ignore
@@ -2699,7 +2695,6 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
- bool need_check = false;
int sector_nr;
int ret;
@@ -2722,7 +2717,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
* We have every sector properly prepared. Can finish the scrub
* and writeback the good content.
*/
- ret = finish_parity_scrub(rbio, need_check);
+ ret = finish_parity_scrub(rbio);
wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int found_errors;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 73f9ea7672db..2ecb76cf3d91 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4078,14 +4078,6 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return has_single_bit_set(flags);
}
-static inline int balance_need_close(struct btrfs_fs_info *fs_info)
-{
- /* cancel requested || normal exit path */
- return atomic_read(&fs_info->balance_cancel_req) ||
- (atomic_read(&fs_info->balance_pause_req) == 0 &&
- atomic_read(&fs_info->balance_cancel_req) == 0);
-}
-
/*
* Validate target profile against allowed profiles and return true if it's OK.
* Otherwise print the error message and return false.
@@ -4275,6 +4267,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
u64 num_devices;
unsigned seq;
bool reducing_redundancy;
+ bool paused = false;
int i;
if (btrfs_fs_closing(fs_info) ||
@@ -4405,6 +4398,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
btrfs_info(fs_info, "balance: paused");
btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ paused = true;
}
/*
* Balance can be canceled by:
@@ -4433,8 +4427,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
btrfs_update_ioctl_balance_args(fs_info, bargs);
}
- if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
- balance_need_close(fs_info)) {
+ /* We didn't pause, we can clean everything up. */
+ if (!paused) {
reset_balance_state(fs_info);
btrfs_exclop_finish(fs_info);
}
@@ -6404,7 +6398,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
(op == BTRFS_MAP_READ || !dev_replace_is_ongoing ||
!dev_replace->tgtdev)) {
set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
- *mirror_num_ret = mirror_num;
+ if (mirror_num_ret)
+ *mirror_num_ret = mirror_num;
*bioc_ret = NULL;
ret = 0;
goto out;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2a29943fa5cc..cfad1eac7fd9 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
*maptype = 0;
return inpage;
}
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
might_sleep();
src = erofs_vm_map_ram(rq->in, ctx->inpages);
if (!src)
@@ -162,7 +162,7 @@ docopy:
src = erofs_get_pcpubuf(ctx->inpages);
if (!src) {
DBG_BUGON(1);
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
return ERR_PTR(-EFAULT);
}
@@ -173,9 +173,9 @@ docopy:
min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
if (!inpage)
- inpage = kmap_atomic(*in);
+ inpage = kmap_local_page(*in);
memcpy(tmp, inpage + *inputmargin, page_copycnt);
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
inpage = NULL;
tmp += page_copycnt;
total -= page_copycnt;
@@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
- headpage = kmap_atomic(*rq->in);
+ headpage = kmap_local_page(*rq->in);
/* LZ4 decompression inplace is only safe if zero_padding is enabled */
if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
@@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
min_t(unsigned int, rq->inputsize,
rq->sb->s_blocksize - rq->pageofs_in));
if (ret) {
- kunmap_atomic(headpage);
+ kunmap_local(headpage);
return ret;
}
may_inplace = !((rq->pageofs_in + rq->inputsize) &
@@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
}
if (maptype == 0) {
- kunmap_atomic(headpage);
+ kunmap_local(headpage);
} else if (maptype == 1) {
vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
@@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
/* one optimized fast path only for non bigpcluster cases yet */
if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
+ dst = kmap_local_page(*rq->out);
dst_maptype = 0;
goto dstmap_out;
}
@@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
dstmap_out:
ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
if (!dst_maptype)
- kunmap_atomic(dst);
+ kunmap_local(dst);
else if (dst_maptype == 2)
vm_unmap_ram(dst, ctx.outpages);
return ret;
@@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
const unsigned int lefthalf = rq->outputsize - righthalf;
const unsigned int interlaced_offset =
rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
- unsigned char *src, *dst;
+ u8 *src;
if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
DBG_BUGON(1);
@@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
}
src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
- if (rq->out[0]) {
- dst = kmap_local_page(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src + interlaced_offset,
- righthalf);
- kunmap_local(dst);
- }
+ if (rq->out[0])
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ src + interlaced_offset, righthalf);
if (outpages > inpages) {
DBG_BUGON(!rq->out[outpages - 1]);
if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
- dst = kmap_local_page(rq->out[outpages - 1]);
- memcpy(dst, interlaced_offset ? src :
- (src + righthalf), lefthalf);
- kunmap_local(dst);
+ memcpy_to_page(rq->out[outpages - 1], 0, src +
+ (interlaced_offset ? 0 : righthalf),
+ lefthalf);
} else if (!interlaced_offset) {
memmove(src, src + righthalf, lefthalf);
+ flush_dcache_page(rq->in[inpages - 1]);
}
}
kunmap_local(src);
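
The z_erofs_transform_plain() hunk above replaces open-coded kmap_local_page()/memcpy()/kunmap_local() sequences with memcpy_to_page() from <linux/highmem.h>. Below is a minimal sketch of the two equivalent forms, using made-up wrapper names (copy_open_coded, copy_with_helper) that do not appear in the patch.

#include <linux/highmem.h>
#include <linux/string.h>

/* Open-coded form: map, copy, unmap, then flush the destination page. */
static void copy_open_coded(struct page *dst, size_t offset,
			    const char *src, size_t len)
{
	char *to = kmap_local_page(dst);

	memcpy(to + offset, src, len);
	kunmap_local(to);
	flush_dcache_page(dst);
}

/* Helper form: memcpy_to_page() performs the same map/copy/flush/unmap. */
static void copy_with_helper(struct page *dst, size_t offset,
			     const char *src, size_t len)
{
	memcpy_to_page(dst, offset, src, len);
}
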
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d70b12b81507..e12592727a54 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -183,7 +183,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
- vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+ (vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
+ vi->datalayout == EROFS_INODE_CHUNK_BASED))
inode->i_flags |= S_DAX;
if (!nblks)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 5f1890e309c6..b69d89a11dd0 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1035,7 +1035,7 @@ hitted:
*/
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- cur = end - min_t(unsigned int, offset + end - map->m_la, end);
+ cur = end - min_t(erofs_off_t, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
goto next_part;
@@ -1841,7 +1841,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
}
cur = map->m_la + map->m_llen - 1;
- while (cur >= end) {
+ while ((cur >= end) && (cur < i_size_read(inode))) {
pgoff_t index = cur >> PAGE_SHIFT;
struct page *page;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..21b903fe546e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
@@ -4761,8 +4765,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
- ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+ loff_t tmp_pa_end;
struct rb_node *iter;
ext4_fsblk_t goal_block;
@@ -4770,47 +4774,151 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return false;
- /* first, try per-file preallocation */
+ /*
+ * first, try per-file preallocation by searching the inode pa rbtree.
+ *
+ * Here, we can't do a direct traversal of the tree because
+ * ext4_mb_discard_group_preallocation() can concurrently mark the pa
+ * deleted and that can cause direct traversal to skip some entries.
+ */
read_lock(&ei->i_prealloc_lock);
+
+ if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+ goto try_group_pa;
+ }
+
+ /*
+ * Step 1: Find a pa with logical start immediately adjacent to the
+ * original logical start. This could be on the left or right.
+ *
+ * (tmp_pa->pa_lstart never changes so we can skip locking for it).
+ */
for (iter = ei->i_prealloc_node.rb_node; iter;
iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
- tmp_pa_start, iter)) {
+ tmp_pa->pa_lstart, iter)) {
tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
+ }
- /* all fields in this condition don't change,
- * so we can skip locking for them */
- tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-
- /* original request start doesn't lie in this PA */
- if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
- ac->ac_o_ex.fe_logical >= tmp_pa_end)
- continue;
+ /*
+ * Step 2: The adjacent pa might be to the right of logical start, find
+ * the left adjacent pa. After this step we'd have a valid tmp_pa whose
+ * logical start is towards the left of original request's logical start
+ */
+ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ struct rb_node *tmp;
+ tmp = rb_prev(&tmp_pa->pa_node.inode_node);
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
- EXT4_MAX_BLOCK_FILE_PHYS)) {
+ if (tmp) {
+ tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ } else {
/*
- * Since PAs don't overlap, we won't find any
- * other PA to satisfy this.
+ * If there is no adjacent pa to the left then finding
+ * an overlapping pa is not possible hence stop searching
+ * inode pa tree
*/
- break;
+ goto try_group_pa;
}
+ }
+
+ BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
- /* found preallocated blocks, use them */
+ /*
+ * Step 3: If the left adjacent pa is deleted, keep moving left to find
+ * the first non deleted adjacent pa. After this step we should have a
+ * valid tmp_pa which is guaranteed to be non deleted.
+ */
+ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
+ if (!iter) {
+ /*
+ * no non deleted left adjacent pa, so stop searching
+ * inode pa tree
+ */
+ goto try_group_pa;
+ }
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
- likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
- atomic_inc(&tmp_pa->pa_count);
- ext4_mb_use_inode_pa(ac, tmp_pa);
+ if (tmp_pa->pa_deleted == 0) {
+ /*
+ * We will keep holding the pa_lock from
+ * this point on because we don't want group discard
+ * to delete this pa underneath us. Since group
+ * discard is anyways an ENOSPC operation it
+ * should be okay for it to wait a few more cycles.
+ */
+ break;
+ } else {
spin_unlock(&tmp_pa->pa_lock);
- read_unlock(&ei->i_prealloc_lock);
- return true;
}
+ }
+
+ BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+ BUG_ON(tmp_pa->pa_deleted == 1);
+
+ /*
+ * Step 4: We now have the non deleted left adjacent pa. Only this
+ * pa can possibly satisfy the request hence check if it overlaps
+ * original logical start and stop searching if it doesn't.
+ */
+ tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+ if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
spin_unlock(&tmp_pa->pa_lock);
+ goto try_group_pa;
+ }
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+ (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS)) {
+ /*
+ * Since PAs don't overlap, we won't find any other PA to
+ * satisfy this.
+ */
+ spin_unlock(&tmp_pa->pa_lock);
+ goto try_group_pa;
+ }
+
+ if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
+ atomic_inc(&tmp_pa->pa_count);
+ ext4_mb_use_inode_pa(ac, tmp_pa);
+ spin_unlock(&tmp_pa->pa_lock);
+ read_unlock(&ei->i_prealloc_lock);
+ return true;
+ } else {
+ /*
+ * We found a valid overlapping pa but couldn't use it because
+ * it had no free blocks. This should ideally never happen
+ * because:
+ *
+ * 1. When a new inode pa is added to rbtree it must have
+ * pa_free > 0 since otherwise we won't actually need
+ * preallocation.
+ *
+ * 2. An inode pa that is in the rbtree can only have its
+ * pa_free become zero when another thread calls:
+ * ext4_mb_new_blocks
+ * ext4_mb_use_preallocated
+ * ext4_mb_use_inode_pa
+ *
+ * 3. Further, after the above calls make pa_free == 0, we will
+ * immediately remove it from the rbtree in:
+ * ext4_mb_new_blocks
+ * ext4_mb_release_context
+ * ext4_mb_put_pa
+ *
+ * 4. Since the pa_free becoming 0 and pa_free getting removed
+ * from tree both happen in ext4_mb_new_blocks, which is always
+ * called with i_data_sem held for data allocations, we can be
+ * sure that another process will never see a pa in rbtree with
+ * pa_free == 0.
+ */
+ WARN_ON_ONCE(tmp_pa->pa_free == 0);
}
+ spin_unlock(&tmp_pa->pa_lock);
+try_group_pa:
read_unlock(&ei->i_prealloc_lock);
/* can we use group allocation? */
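
The rewritten lookup above walks the inode PA rbtree down to the node nearest the request's logical start and then, if that node lies to the right, steps to its in-order predecessor with rb_prev(). Here is a generic sketch of that search pattern, with invented struct and function names (struct item, find_left_neighbour) rather than the ext4 ones.

#include <linux/rbtree.h>

struct item {
	unsigned long start;
	struct rb_node node;
};

/*
 * Descend to the node closest to @key, then step to the in-order
 * predecessor if that node lies to the right of @key.  Returns the
 * right-most item whose start is <= @key, or NULL if none exists.
 */
static struct item *find_left_neighbour(struct rb_root *root, unsigned long key)
{
	struct rb_node *iter = root->rb_node;
	struct item *it = NULL;

	while (iter) {
		it = rb_entry(iter, struct item, node);
		if (key < it->start)
			iter = iter->rb_left;
		else if (key > it->start)
			iter = iter->rb_right;
		else
			return it;			/* exact match */
	}
	if (it && it->start > key) {			/* landed to the right */
		struct rb_node *prev = rb_prev(&it->node);

		it = prev ? rb_entry(prev, struct item, node) : NULL;
	}
	return it;
}
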
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 321e3a888c20..05151d61b00b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1782,6 +1782,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
memmove(here, (void *)here + size,
(void *)last - (void *)here + sizeof(__u32));
memset(last, 0, size);
+
+ /*
+ * Update i_inline_off - moved ibody region might contain
+ * system.data attribute. Handling a failure here won't
+ * cause other complications for setting an xattr.
+ */
+ if (!is_block && ext4_has_inline_data(inode)) {
+ ret = ext4_find_inline_data_nolock(inode);
+ if (ret) {
+ ext4_warning_inode(inode,
+ "unable to update i_inline_off");
+ goto out;
+ }
+ }
} else if (s->not_found) {
/* Insert new name. */
size_t size = EXT4_XATTR_LEN(name_len);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 35bc174f9ba2..f67bef9d83c4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -258,7 +258,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
spin_unlock(&fi->lock);
}
kfree(forget);
- if (ret == -ENOMEM)
+ if (ret == -ENOMEM || ret == -EINTR)
goto out;
if (ret || fuse_invalid_attr(&outarg.attr) ||
fuse_stale_inode(inode, outarg.generation, &outarg.attr))
@@ -395,8 +395,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
goto out_put_forget;
err = -EIO;
- if (!outarg->nodeid)
- goto out_put_forget;
if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d66070af145d..f19d748890f0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1134,7 +1134,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
process_init_limits(fc, arg);
if (arg->minor >= 6) {
- u64 flags = arg->flags | (u64) arg->flags2 << 32;
+ u64 flags = arg->flags;
+
+ if (flags & FUSE_INIT_EXT)
+ flags |= (u64) arg->flags2 << 32;
ra_pages = arg->max_readahead / PAGE_SIZE;
if (flags & FUSE_ASYNC_READ)
@@ -1254,7 +1257,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
- FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP;
+ FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
+ FUSE_HAS_EXPIRE_ONLY;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 8e01bfdfc430..726640fa439e 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -9,14 +9,23 @@
#include <linux/compat.h>
#include <linux/fileattr.h>
-static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args)
+static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
+ struct fuse_ioctl_out *outarg)
{
- ssize_t ret = fuse_simple_request(fm, args);
+ ssize_t ret;
+
+ args->out_args[0].size = sizeof(*outarg);
+ args->out_args[0].value = outarg;
+
+ ret = fuse_simple_request(fm, args);
/* Translate ENOSYS, which shouldn't be returned from fs */
if (ret == -ENOSYS)
ret = -ENOTTY;
+ if (ret >= 0 && outarg->result == -ENOSYS)
+ outarg->result = -ENOTTY;
+
return ret;
}
@@ -264,13 +273,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
ap.args.out_numargs = 2;
- ap.args.out_args[0].size = sizeof(outarg);
- ap.args.out_args[0].value = &outarg;
ap.args.out_args[1].size = out_size;
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_send_ioctl(fm, &ap.args);
+ transferred = fuse_send_ioctl(fm, &ap.args, &outarg);
err = transferred;
if (transferred < 0)
goto out;
@@ -399,12 +406,10 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.in_args[1].size = inarg.in_size;
args.in_args[1].value = ptr;
args.out_numargs = 2;
- args.out_args[0].size = sizeof(outarg);
- args.out_args[0].value = &outarg;
args.out_args[1].size = inarg.out_size;
args.out_args[1].value = ptr;
- err = fuse_send_ioctl(fm, &args);
+ err = fuse_send_ioctl(fm, &args, &outarg);
if (!err) {
if (outarg.result < 0)
err = outarg.result;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index adb92cdb24b0..aa8967cca1a3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -872,10 +872,10 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
- if (unlikely(ret < 0))
+ if (unlikely(iter.pos == iocb->ki_pos))
return ret;
ret = iter.pos - iocb->ki_pos;
- iocb->ki_pos += ret;
+ iocb->ki_pos = iter.pos;
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -27,7 +27,7 @@
*
* Called with j_list_lock held.
*/
-static inline void __buffer_unlink_first(struct journal_head *jh)
+static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
@@ -41,45 +41,6 @@ static inline void __buffer_unlink_first(struct journal_head *jh)
}
/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
- if (transaction->t_checkpoint_io_list == jh) {
- transaction->t_checkpoint_io_list = jh->b_cpnext;
- if (transaction->t_checkpoint_io_list == jh)
- transaction->t_checkpoint_io_list = NULL;
- }
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
-/*
* Check a checkpoint buffer could be release or not.
*
* Requires j_list_lock
@@ -183,6 +144,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -242,15 +204,6 @@ restart:
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- goto retry;
- }
if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
@@ -285,30 +238,50 @@ restart:
spin_lock(&journal->j_list_lock);
goto restart;
}
- if (!buffer_dirty(bh)) {
+ if (!trylock_buffer(bh)) {
+ /*
+ * The buffer is locked, it may be writing back, or
+ * flushing out in the last couple of cycles, or
+ * re-adding into a new transaction, need to check
+ * it again until it's unlocked.
+ */
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ goto retry;
+ } else if (!buffer_dirty(bh)) {
+ unlock_buffer(bh);
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ unlock_buffer(bh);
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -322,38 +295,6 @@ restart:
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
result = jbd2_cleanup_journal_tail(journal);
@@ -409,49 +350,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
/* Checkpoint list management */
/*
- * journal_clean_one_cp_list
- *
- * Find all the written-back checkpoint buffers in the given list and
- * release them. If 'destroy' is set, clean all buffers unconditionally.
- *
- * Called with j_list_lock held.
- * Returns 1 if we freed the transaction, 0 otherwise.
- */
-static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
-{
- struct journal_head *last_jh;
- struct journal_head *next_jh = jh;
-
- if (!jh)
- return 0;
-
- last_jh = jh->b_cpprev;
- do {
- jh = next_jh;
- next_jh = jh->b_cpnext;
-
- if (!destroy && __cp_buffer_busy(jh))
- return 0;
-
- if (__jbd2_journal_remove_checkpoint(jh))
- return 1;
- /*
- * This function only frees up some memory
- * if possible so we dont have an obligation
- * to finish processing. Bail out if preemption
- * requested:
- */
- if (need_resched())
- return 0;
- } while (jh != last_jh);
-
- return 0;
-}
-
-/*
* journal_shrink_one_cp_list
*
- * Find 'nr_to_scan' written-back checkpoint buffers in the given list
+ * Find all the written-back checkpoint buffers in the given list
* and try to release them. If the whole transaction is released, set
* the 'released' parameter. Return the number of released checkpointed
* buffers.
@@ -459,15 +360,15 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
* Called with j_list_lock held.
*/
static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
- unsigned long *nr_to_scan,
- bool *released)
+ bool destroy, bool *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
unsigned long nr_freed = 0;
int ret;
- if (!jh || *nr_to_scan == 0)
+ *released = false;
+ if (!jh)
return 0;
last_jh = jh->b_cpprev;
@@ -475,12 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- (*nr_to_scan)--;
- if (__cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -488,7 +392,7 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
if (need_resched())
break;
- } while (jh != last_jh && *nr_to_scan);
+ } while (jh != last_jh);
return nr_freed;
}
@@ -506,11 +410,11 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
unsigned long *nr_to_scan)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- bool released;
+ bool __maybe_unused released;
tid_t first_tid = 0, last_tid = 0, next_tid = 0;
tid_t tid = 0;
unsigned long nr_freed = 0;
- unsigned long nr_scanned = *nr_to_scan;
+ unsigned long freed;
again:
spin_lock(&journal->j_list_lock);
@@ -539,19 +443,11 @@ again:
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
tid = transaction->t_tid;
- released = false;
-
- nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
- nr_to_scan, &released);
- if (*nr_to_scan == 0)
- break;
- if (need_resched() || spin_needbreak(&journal->j_list_lock))
- break;
- if (released)
- continue;
- nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
- nr_to_scan, &released);
+ freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ false, &released);
+ nr_freed += freed;
+ (*nr_to_scan) -= min(*nr_to_scan, freed);
if (*nr_to_scan == 0)
break;
if (need_resched() || spin_needbreak(&journal->j_list_lock))
@@ -572,9 +468,8 @@ again:
if (*nr_to_scan && next_tid)
goto again;
out:
- nr_scanned -= *nr_to_scan;
trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
- nr_freed, nr_scanned, next_tid);
+ nr_freed, next_tid);
return nr_freed;
}
@@ -590,7 +485,7 @@ out:
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- int ret;
+ bool released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
@@ -601,8 +496,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
- ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
- destroy);
+ journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ destroy, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
@@ -610,23 +505,12 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
*/
if (need_resched())
return;
- if (ret)
- continue;
- /*
- * It is essential that we are as careful as in the case of
- * t_checkpoint_list with removing the buffer from the list as
- * we can possibly see not yet submitted buffers on io_list
- */
- ret = journal_clean_one_cp_list(transaction->
- t_checkpoint_io_list, destroy);
- if (need_resched())
- return;
/*
* Stop scanning if we couldn't free the transaction. This
* avoids pointless scanning of transactions which still
* weren't checkpointed.
*/
- if (!ret)
+ if (!released)
return;
} while (transaction != last_transaction);
}
@@ -705,7 +589,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
jbd2_journal_put_journal_head(jh);
/* Is this transaction empty? */
- if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list)
+ if (transaction->t_checkpoint_list)
return 0;
/*
@@ -737,6 +621,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
}
/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
+/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
@@ -797,7 +709,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
- J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b33155dd7001..1073259902a6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1141,8 +1141,7 @@ restart_loop:
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Check if the transaction can be dropped now that we are finished */
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
+ if (commit_transaction->t_checkpoint_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
jbd2_journal_free_transaction(commit_transaction);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..4d1fda1f7143 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2100,35 +2099,6 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
__brelse(bh);
}
-/*
- * Called from jbd2_journal_try_to_free_buffers().
- *
- * Called under jh->b_state_lock
- */
-static void
-__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
-{
- struct journal_head *jh;
-
- jh = bh2jh(bh);
-
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
- if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
-
- spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
- spin_unlock(&journal->j_list_lock);
-out:
- return;
-}
-
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
@@ -2186,7 +2156,13 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
continue;
spin_lock(&jh->b_state_lock);
- __journal_try_to_free_buffer(journal, bh);
+ if (!jh->b_transaction && !jh->b_next_transaction) {
+ spin_lock(&journal->j_list_lock);
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ }
spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
if (buffer_jbd(bh))
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index d7274eefc666..15c8cc4b6680 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -159,6 +159,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 43
-#define CIFS_VERSION "2.43"
+#define SMB3_PRODUCT_BUILD 44
+#define CIFS_VERSION "2.44"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index b5808fe3469a..e5eec6d38d02 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -532,7 +532,7 @@ struct smb_version_operations {
/* Check for STATUS_IO_TIMEOUT */
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
- void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
+ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
};
struct smb_version_values {
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 19f7385abeec..9dee267f1893 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -3184,7 +3184,7 @@ setAclRetry:
param_offset = offsetof(struct smb_com_transaction2_spi_req,
InformationLevel) - 4;
offset = param_offset + params;
- parm_data = ((char *) &pSMB->hdr.Protocol) + offset;
+ parm_data = ((char *)pSMB) + sizeof(pSMB->hdr.smb_buf_length) + offset;
pSMB->ParameterOffset = cpu_to_le16(param_offset);
/* convert to on the wire format for POSIX ACL */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 85dd1b373974..9280e253bf09 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -60,7 +60,7 @@ extern bool disable_legacy_dialects;
#define TLINK_IDLE_EXPIRE (600 * HZ)
/* Drop the connection to not overload the server */
-#define NUM_STATUS_IO_TIMEOUT 5
+#define MAX_STATUS_IO_TIMEOUT 5
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
@@ -1117,6 +1117,7 @@ cifs_demultiplex_thread(void *p)
struct mid_q_entry *mids[MAX_COMPOUND];
char *bufs[MAX_COMPOUND];
unsigned int noreclaim_flag, num_io_timeout = 0;
+ bool pending_reconnect = false;
noreclaim_flag = memalloc_noreclaim_save();
cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
@@ -1156,6 +1157,8 @@ cifs_demultiplex_thread(void *p)
cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
+
+ pending_reconnect = false;
next_pdu:
server->pdu_size = pdu_length;
@@ -1213,10 +1216,13 @@ next_pdu:
if (server->ops->is_status_io_timeout &&
server->ops->is_status_io_timeout(buf)) {
num_io_timeout++;
- if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) {
- cifs_reconnect(server, false);
+ if (num_io_timeout > MAX_STATUS_IO_TIMEOUT) {
+ cifs_server_dbg(VFS,
+ "Number of request timeouts exceeded %d. Reconnecting",
+ MAX_STATUS_IO_TIMEOUT);
+
+ pending_reconnect = true;
num_io_timeout = 0;
- continue;
}
}
@@ -1226,9 +1232,14 @@ next_pdu:
if (mids[i] != NULL) {
mids[i]->resp_buf_size = server->pdu_size;
- if (bufs[i] && server->ops->is_network_name_deleted)
- server->ops->is_network_name_deleted(bufs[i],
- server);
+ if (bufs[i] != NULL) {
+ if (server->ops->is_network_name_deleted &&
+ server->ops->is_network_name_deleted(bufs[i],
+ server)) {
+ cifs_server_dbg(FYI,
+ "Share deleted. Reconnect needed");
+ }
+ }
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
@@ -1263,6 +1274,11 @@ next_pdu:
buf = server->smallbuf;
goto next_pdu;
}
+
+ /* do this reconnect at the very end after processing all MIDs */
+ if (pending_reconnect)
+ cifs_reconnect(server, true);
+
} /* end while !EXITING */
/* buffer usually freed in free_mid - need to free it here on exit */
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 1403a2d1ab17..df3fd3b720da 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -66,6 +66,12 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
return rc;
}
+/*
+ * Track individual DFS referral servers used by new DFS mount.
+ *
+ * On success, their lifetime will be shared by final tcon (dfs_ses_list).
+ * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount().
+ */
static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
@@ -80,11 +86,12 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
INIT_LIST_HEAD(&root_ses->list);
spin_lock(&cifs_tcp_ses_lock);
- ses->ses_count++;
+ cifs_smb_ses_inc_refcount(ses);
spin_unlock(&cifs_tcp_ses_lock);
root_ses->ses = ses;
list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list);
}
+ /* Select new DFS referral server so that new referrals go through it */
ctx->dfs_root_ses = ses;
return 0;
}
@@ -242,7 +249,6 @@ out:
int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct cifs_ses *ses;
bool nodfs = ctx->nodfs;
int rc;
@@ -276,20 +282,8 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
}
*isdfs = true;
- /*
- * Prevent DFS root session of being put in the first call to
- * cifs_mount_put_conns(). If another DFS root server was not found
- * while chasing the referrals (@ctx->dfs_root_ses == @ses), then we
- * can safely put extra refcount of @ses.
- */
- ses = mnt_ctx->ses;
- mnt_ctx->ses = NULL;
- mnt_ctx->server = NULL;
- rc = __dfs_mount_share(mnt_ctx);
- if (ses == ctx->dfs_root_ses)
- cifs_put_smb_ses(ses);
-
- return rc;
+ add_root_smb_session(mnt_ctx);
+ return __dfs_mount_share(mnt_ctx);
}
/* Update dfs referral path of superblock */
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 879bc8e6555c..fc5acc95cd13 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1080,8 +1080,8 @@ int cifs_close(struct inode *inode, struct file *file)
cfile = file->private_data;
file->private_data = NULL;
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
- if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
- cinode->lease_granted &&
+ if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG)
+ && cinode->lease_granted &&
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index fff092bbc7a3..e1904b86ed96 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -433,16 +433,21 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
* Dump encryption keys. This is an old ioctl that only
* handles AES-128-{CCM,GCM}.
*/
- if (pSMBFile == NULL)
- break;
if (!capable(CAP_SYS_ADMIN)) {
rc = -EACCES;
break;
}
- tcon = tlink_tcon(pSMBFile->tlink);
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
if (!smb3_encryption_required(tcon)) {
rc = -EOPNOTSUPP;
+ cifs_put_tlink(tlink);
break;
}
pkey_inf.cipher_type =
@@ -459,6 +464,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EFAULT;
else
rc = 0;
+ cifs_put_tlink(tlink);
break;
case CIFS_DUMP_FULL_KEY:
/*
@@ -470,8 +476,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EACCES;
break;
}
- tcon = tlink_tcon(pSMBFile->tlink);
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ tcon = tlink_tcon(tlink);
rc = cifs_dump_full_key(tcon, (void __user *)arg);
+ cifs_put_tlink(tlink);
break;
case CIFS_IOC_NOTIFY:
if (!S_ISDIR(inode->i_mode)) {
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 87abce010974..0f62bc373ad0 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2395,7 +2395,7 @@ smb2_is_status_io_timeout(char *buf)
return false;
}
-static void
+static bool
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
@@ -2404,7 +2404,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
if (shdr->Status != STATUS_NETWORK_NAME_DELETED)
- return;
+ return false;
/* If server is a channel, select the primary channel */
pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
@@ -2419,11 +2419,13 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
tcon->tree_name);
- return;
+ return true;
}
}
}
spin_unlock(&cifs_tcp_ses_lock);
+
+ return false;
}
static int
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index c6db898dab7c..7676091b3e77 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -160,7 +160,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
spin_unlock(&ses->ses_lock);
continue;
}
- ++ses->ses_count;
+ cifs_smb_ses_inc_refcount(ses);
spin_unlock(&ses->ses_lock);
return ses;
}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 25e2841084e1..f9015f88eca7 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -591,7 +591,7 @@ struct xfs_attr_shortform {
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
uint8_t nameval[]; /* name & value bytes concatenated */
- } list[1]; /* variable sized array */
+ } list[]; /* variable sized array */
};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
@@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
typedef struct xfs_attr_leaf_name_local {
__be16 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
- __u8 nameval[1]; /* name/value bytes */
+ /*
+ * In Linux 6.5 this flex array was converted from nameval[1] to
+ * nameval[]. Be very careful here about extra padding at the end;
+ * see xfs_attr_leaf_entsize_local() for details.
+ */
+ __u8 nameval[]; /* name/value bytes */
} xfs_attr_leaf_name_local_t;
typedef struct xfs_attr_leaf_name_remote {
__be32 valueblk; /* block number of value bytes */
__be32 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
- __u8 name[1]; /* name bytes */
+ /*
+ * In Linux 6.5 this flex array was converted from name[1] to name[].
+ * Be very careful here about extra padding at the end; see
+ * xfs_attr_leaf_entsize_remote() for details.
+ */
+ __u8 name[]; /* name bytes */
} xfs_attr_leaf_name_remote_t;
typedef struct xfs_attr_leafblock {
xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */
/*
* The rest of the block contains the following structures after the
* leaf entries, growing from the bottom up. The variables are never
@@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr {
struct xfs_attr3_leafblock {
struct xfs_attr3_leaf_hdr hdr;
- struct xfs_attr_leaf_entry entries[1];
+ struct xfs_attr_leaf_entry entries[];
/*
* The rest of the block contains the following structures after the
@@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
- nlen, XFS_ATTR_LEAF_NAME_ALIGN);
+ /*
+ * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with
+ * name[1], which was used as a flexarray. The layout of this struct
+ * is 9 bytes of fixed-length fields followed by a __u8 flex array at
+ * offset 9.
+ *
+ * On most architectures, struct xfs_attr_leaf_name_remote had two
+ * bytes of implicit padding at the end of the struct to make the
+ * struct length 12. After converting name[1] to name[], there are
+ * three implicit padding bytes and the struct size remains 12.
+ * However, there are compiler configurations that do not add implicit
+ * padding at all (m68k) and have been broken for years.
+ *
+ * This entsize computation historically added (the xattr name length)
+ * to (the padded struct length - 1) and rounded that sum up to the
+ * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4).
+ * This is encoded in the ondisk format, so we cannot change this.
+ *
+ * Compute the entsize from offsetof of the flexarray and manually
+ * adding bytes for the implicit padding.
+ */
+ const size_t remotesize =
+ offsetof(struct xfs_attr_leaf_name_remote, name) + 2;
+
+ return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
- nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
+ /*
+ * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with
+ * nameval[1], which was used as a flexarray. The layout of this
+ * struct is 3 bytes of fixed-length fields followed by a __u8 flex
+ * array at offset 3.
+ *
+ * struct xfs_attr_leaf_name_local had zero bytes of implicit padding
+ * at the end of the struct to make the struct length 4. On most
+ * architectures, after converting nameval[1] to nameval[], there is
+ * one implicit padding byte and the struct size remains 4. However,
+ * there are compiler configurations that do not add implicit padding
+ * at all (m68k) and would break.
+ *
+ * This entsize computation historically added (the xattr name and
+ * value length) to (the padded struct length - 1) and rounded that sum
+ * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is
+ * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format,
+ * so we cannot change this.
+ *
+ * Compute the entsize from offsetof of the flexarray and manually
+ * adding bytes for the implicit padding.
+ */
+ const size_t localsize =
+ offsetof(struct xfs_attr_leaf_name_local, nameval);
+
+ return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
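
The comments added above explain that the on-disk entry sizes must keep matching the historical formulas round_up(11 + nlen, 4) and round_up(3 + nlen + vlen, 4) even after the one-element arrays become flexible arrays. The following is a small userspace sketch (plain C, not kernel code, with stand-in struct names mirroring the on-disk layouts on typical ABIs) that checks the offsetof-based formulas against the historical ones.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define NAME_ALIGN	4
#define round_up(x, a)	((((x) + (a) - 1) / (a)) * (a))

struct name_local  { uint16_t valuelen; uint8_t namelen; uint8_t nameval[]; };
struct name_remote { uint32_t valueblk; uint32_t valuelen; uint8_t namelen; uint8_t name[]; };

static int entsize_local(int nlen, int vlen)
{
	/* offsetof(..., nameval) == 3; no historical tail padding to add */
	return round_up(offsetof(struct name_local, nameval) + nlen + vlen,
			NAME_ALIGN);
}

static int entsize_remote(int nlen)
{
	/* offsetof(..., name) == 9, plus the 2 historical padding bytes */
	return round_up(offsetof(struct name_remote, name) + 2 + nlen,
			NAME_ALIGN);
}

int main(void)
{
	for (int nlen = 1; nlen <= 255; nlen++) {
		assert(entsize_remote(nlen) == round_up(11 + nlen, 4));
		for (int vlen = 0; vlen <= 64; vlen++)
			assert(entsize_local(nlen, vlen) ==
			       round_up(3 + nlen + vlen, 4));
	}
	printf("entsize formulas match the historical on-disk values\n");
	return 0;
}
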
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 9c60ebb328b4..2cbf9ea39b8c 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor {
struct xfs_attrlist {
__s32 al_count; /* number of entries in attrlist */
__s32 al_more; /* T/F: more attrs (do call again) */
- __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+ __s32 al_offset[]; /* byte offsets of attrs [var-sized] */
};
struct xfs_attrlist_ent { /* data from attr_list() */
__u32 a_valuelen; /* number bytes in value of attr */
- char a_name[1]; /* attr name (NULL terminated) */
+ char a_name[]; /* attr name (NULL terminated) */
};
typedef struct xfs_fsop_attrlist_handlereq {
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 9737b5a9f405..c4cc99b70dd3 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void)
/* dir/attr trees */
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
- XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80);
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
@@ -88,7 +88,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);