summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file.c97
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/ioctl.c261
-rw-r--r--fs/btrfs/lzo.c11
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/transaction.c12
-rw-r--r--fs/cifs/cifsacl.c9
-rw-r--r--fs/cifs/cifsfs.c1
-rw-r--r--fs/cifs/fs_context.c4
-rw-r--r--fs/cifs/sess.c11
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/configfs/dir.c14
-rw-r--r--fs/file_table.c8
-rw-r--r--fs/io_uring.c24
-rw-r--r--fs/namespace.c30
-rw-r--r--fs/nfs/dir.c4
-rw-r--r--fs/nfs/inode.c9
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/tracefs/inode.c5
-rw-r--r--fs/xfs/xfs_super.c7
23 files changed, 413 insertions, 119 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8992e0096163..947f04789389 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3291,7 +3291,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
+ struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 5a36add21305..c28ceddefae4 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
free_extent_map(merge);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e217337dff9..d2fa32ffe304 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -25,6 +25,8 @@ enum {
EXTENT_FLAG_FILLING,
/* filesystem extent mapping type */
EXTENT_FLAG_FS_MAPPING,
+ /* This em is merged from two or more physically adjacent ems */
+ EXTENT_FLAG_MERGED,
};
struct extent_map {
@@ -40,6 +42,12 @@ struct extent_map {
u64 ram_bytes;
u64 block_start;
u64 block_len;
+
+ /*
+ * Generation of the extent map, for merged em it's the highest
+ * generation of all merged ems.
+ * For non-merged extents, it's from btrfs_file_extent_item::generation.
+ */
u64 generation;
unsigned long flags;
/* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 11204dbbe053..a0179cc62913 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -50,11 +50,14 @@ struct inode_defrag {
/* root objectid */
u64 root;
- /* last offset we were able to defrag */
- u64 last_offset;
-
- /* if we've wrapped around back to zero once already */
- int cycled;
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents,
+ * thus needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
@@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
- if (defrag->last_offset > entry->last_offset)
- entry->last_offset = defrag->last_offset;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
return -EEXIST;
}
}
@@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
@@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
}
/*
- * Requeue the defrag object. If there is a defrag object that points to
- * the same inode in the tree, we will merge them together (by
- * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
- */
-static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- goto out;
-
- /*
- * Here we don't check the IN_DEFRAG flag, because we need merge
- * them together.
- */
- spin_lock(&fs_info->defrag_inodes_lock);
- ret = __btrfs_add_inode_defrag(inode, defrag);
- spin_unlock(&fs_info->defrag_inodes_lock);
- if (ret)
- goto out;
- return;
-out:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-}
-
-/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
@@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_root *inode_root;
struct inode *inode;
struct btrfs_ioctl_defrag_range_args range;
- int num_defrag;
- int ret;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
/* get the inode */
inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
@@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
goto cleanup;
}
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
- range.start = defrag->last_offset;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
sb_start_write(fs_info->sb);
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == BTRFS_DEFRAG_BATCH) {
- defrag->last_offset = range.start;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
-
iput(inode);
- return 0;
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
cleanup:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b2403b6127f..76e530f76e3c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -560,12 +560,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
- u64 start, u64 end, u64 num_bytes, u64 small_write)
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode);
+ btrfs_add_inode_defrag(NULL, inode, small_write);
}
/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 33eda39df685..8d47ec5fc4f4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1012,8 +1012,155 @@ out:
return ret;
}
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still has the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely not extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- bool locked)
+ u64 newer_than, bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -1028,6 +1175,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
if (!em) {
struct extent_state *cached = NULL;
u64 end = start + sectorsize - 1;
@@ -1035,7 +1196,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* get the big lock and read metadata off disk */
if (!locked)
lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
unlock_extent_cached(io_tree, start, end, &cached);
@@ -1046,23 +1207,42 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
return em;
}
+static u32 get_extent_max_capacity(const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return BTRFS_MAX_EXTENT_SIZE;
+}
+
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
bool locked)
{
struct extent_map *next;
- bool ret = true;
+ bool ret = false;
/* this is the last extent */
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len, locked);
+ /*
+ * We want to check if the next extent can be merged with the current
+ * one, which can be an extent created in a past generation, so we pass
+ * a minimum generation of 0 to defrag_lookup_extent().
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+ /* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- ret = false;
- else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > SZ_128K && next->block_len > SZ_128K))
- ret = false;
-
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(em))
+ goto out;
+ ret = true;
+out:
free_extent_map(next);
return ret;
}
@@ -1186,8 +1366,10 @@ struct defrag_target_range {
static int defrag_collect_targets(struct btrfs_inode *inode,
u64 start, u64 len, u32 extent_thresh,
u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list)
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
{
+ bool last_is_target = false;
u64 cur = start;
int ret = 0;
@@ -1197,7 +1379,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
bool next_mergeable = true;
u64 range_len;
- em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur,
+ newer_than, locked);
if (!em)
break;
@@ -1210,6 +1394,10 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->generation < newer_than)
goto next;
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
/*
* Our start offset might be in the middle of an existing extent
* map, so take that into account.
@@ -1250,6 +1438,13 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (range_len >= extent_thresh)
goto next;
+ /*
+ * Skip extents already at its max capacity, this is mostly for
+ * compressed extents, which max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(em))
+ goto next;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
locked);
if (!next_mergeable) {
@@ -1268,6 +1463,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
}
add:
+ last_is_target = true;
range_len = min(extent_map_end(em), start + len) - cur;
/*
* This one is a good target, check if it can be merged into
@@ -1311,6 +1507,17 @@ next:
kfree(entry);
}
}
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
return ret;
}
@@ -1369,7 +1576,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
}
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
- u32 extent_thresh, u64 newer_than, bool do_compress)
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
{
struct extent_state *cached_state = NULL;
struct defrag_target_range *entry;
@@ -1415,7 +1623,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
*/
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, true,
- &target_list);
+ &target_list, last_scanned_ret);
if (ret < 0)
goto unlock_extent;
@@ -1450,7 +1658,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
u64 start, u32 len, u32 extent_thresh,
u64 newer_than, bool do_compress,
unsigned long *sectors_defragged,
- unsigned long max_sectors)
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
{
const u32 sectorsize = inode->root->fs_info->sectorsize;
struct defrag_target_range *entry;
@@ -1461,7 +1670,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, false,
- &target_list);
+ &target_list, NULL);
if (ret < 0)
goto out;
@@ -1478,6 +1687,15 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
range_len = min_t(u32, range_len,
(max_sectors - *sectors_defragged) * sectorsize);
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
if (ra)
page_cache_sync_readahead(inode->vfs_inode.i_mapping,
ra, NULL, entry->start >> PAGE_SHIFT,
@@ -1490,7 +1708,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
* accounting.
*/
ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress);
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
if (ret < 0)
break;
*sectors_defragged += range_len >>
@@ -1501,6 +1720,8 @@ out:
list_del_init(&entry->list);
kfree(entry);
}
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
return ret;
}
@@ -1586,6 +1807,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
u64 cluster_end;
/* The cluster size 256K should always be page aligned */
@@ -1615,8 +1837,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
BTRFS_I(inode)->defrag_compress = compress_type;
ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress,
- &sectors_defragged, max_to_defrag);
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1624,11 +1846,12 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
- cur = cluster_end + 1;
+ cur = max(cluster_end + 1, last_scanned);
if (ret > 0) {
ret = 0;
break;
}
+ cond_resched();
}
if (ra_allocated)
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 0fb90cbe7669..e6e28a9c7987 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -380,6 +380,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
kunmap(cur_page);
cur_in += LZO_LEN;
+ if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) {
+ /*
+ * seg_len shouldn't be larger than we have allocated
+ * for workspace->cbuf
+ */
+ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
+ seg_len);
+ ret = -EIO;
+ goto out;
+ }
+
/* Copy the compressed segment payload into workspace */
copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d8ccb62aa7d2..201eb2628aea 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4999,6 +4999,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c43bbc7f623e..c3cfdfd8de9b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1981,16 +1981,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
/*
- * We use writeback_inodes_sb here because if we used
+ * We use try_to_writeback_inodes_sb() here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
* Currently are holding the fs freeze lock, if we do an async flush
* we'll do btrfs_join_transaction() and deadlock because we need to
* wait for the fs freeze lock. Using the direct flushing we benefit
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
+ *
+ * Note that try_to_writeback_inodes_sb() will only trigger writeback
+ * if it can read lock sb->s_umount. It will always be able to lock it,
+ * except when the filesystem is being unmounted or being frozen, but in
+ * those cases sync_filesystem() is called, which results in calling
+ * writeback_inodes_sb() while holding a write lock on sb->s_umount.
+ * Note that we don't call writeback_inodes_sb() directly, because it
+ * will emit a warning if sb->s_umount is not locked.
*/
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index ee3aab3dd4ac..bf861fef2f0c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -949,6 +949,9 @@ static void populate_new_aces(char *nacl_base,
pnntace = (struct cifs_ace *) (nacl_base + nsize);
nsize += setup_special_mode_ACE(pnntace, nmode);
num_aces++;
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += setup_authusers_ACE(pnntace);
+ num_aces++;
goto set_size;
}
@@ -1297,7 +1300,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
if (uid_valid(uid)) { /* chown */
uid_t id;
- nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!nowner_sid_ptr) {
rc = -ENOMEM;
@@ -1326,7 +1329,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
}
if (gid_valid(gid)) { /* chgrp */
gid_t id;
- ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!ngroup_sid_ptr) {
rc = -ENOMEM;
@@ -1613,7 +1616,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
nsecdesclen = secdesclen;
if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
if (mode_from_sid)
- nsecdesclen += sizeof(struct cifs_ace);
+ nsecdesclen += 2 * sizeof(struct cifs_ace);
else /* cifsacl */
nsecdesclen += 5 * sizeof(struct cifs_ace);
} else { /* chown */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 199edac0cb59..082c21478686 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -919,6 +919,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
out_super:
deactivate_locked_super(sb);
+ return root;
out:
if (cifs_sb) {
kfree(cifs_sb->prepath);
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 7ec35f3f0a5f..a92e9eec521f 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -149,7 +149,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
fsparam_u32("handletimeout", Opt_handletimeout),
- fsparam_u32("snapshot", Opt_snapshot),
+ fsparam_u64("snapshot", Opt_snapshot),
fsparam_u32("max_channels", Opt_max_channels),
/* Mount options which take string value */
@@ -1078,7 +1078,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
- ctx->snapshot_time = result.uint_32;
+ ctx->snapshot_time = result.uint_64;
break;
case Opt_max_credits:
if (result.uint_32 < 20 || result.uint_32 > 60000) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5723d50340e5..32f478c7a66d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -127,11 +127,6 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
struct cifs_server_iface *ifaces = NULL;
size_t iface_count;
- if (ses->server->dialect < SMB30_PROT_ID) {
- cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
- return 0;
- }
-
spin_lock(&ses->chan_lock);
new_chan_count = old_chan_count = ses->chan_count;
@@ -145,6 +140,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
return 0;
}
+ if (ses->server->dialect < SMB30_PROT_ID) {
+ spin_unlock(&ses->chan_lock);
+ cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
+ return 0;
+ }
+
if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
ses->chan_max = 1;
spin_unlock(&ses->chan_lock);
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 7d8b72d67c80..9d486fbbfbbd 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -175,11 +175,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
switch (handler->flags) {
case XATTR_CIFS_NTSD_FULL:
aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP |
CIFS_ACL_DACL |
CIFS_ACL_SACL);
break;
case XATTR_CIFS_NTSD:
aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
case XATTR_CIFS_ACL:
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d3cd2a94d1e8..d1f9d2632202 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -34,6 +34,14 @@
*/
DEFINE_SPINLOCK(configfs_dirent_lock);
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_mutex is held.
+ * But parent configfs_subsystem is NULL when config_item is root.
+ * Use this mutex when config_item is root.
+ */
+static DEFINE_MUTEX(configfs_subsystem_mutex);
+
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
{
@@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
sd = root->d_fsdata;
+ mutex_lock(&configfs_subsystem_mutex);
link_group(to_config_group(sd->s_element), group);
+ mutex_unlock(&configfs_subsystem_mutex);
inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
@@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
inode_unlock(d_inode(root));
if (err) {
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
put_fragment(frag);
@@ -1931,7 +1943,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
dput(dentry);
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
diff --git a/fs/file_table.c b/fs/file_table.c
index 4969021fa676..7d2e692b66a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -27,6 +27,7 @@
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>
+#include <linux/kmemleak.h>
#include <linux/atomic.h>
@@ -119,8 +120,11 @@ static struct ctl_table fs_stat_sysctls[] = {
static int __init init_fs_stat_sysctls(void)
{
register_sysctl_init("fs", fs_stat_sysctls);
- if (IS_ENABLED(CONFIG_BINFMT_MISC))
- register_sysctl_mount_point("fs/binfmt_misc");
+ if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
+ struct ctl_table_header *hdr;
+ hdr = register_sysctl_mount_point("fs/binfmt_misc");
+ kmemleak_not_leak(hdr);
+ }
return 0;
}
fs_initcall(init_fs_stat_sysctls);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..4715980e9015 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4567,6 +4567,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
} else {
list_add_tail(&buf->list, &(*head)->list);
}
+ cond_resched();
}
return i ? i : -ENOMEM;
@@ -7693,7 +7694,7 @@ static int io_run_task_work_sig(void)
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- signed long *timeout)
+ ktime_t timeout)
{
int ret;
@@ -7705,8 +7706,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
- *timeout = schedule_timeout(*timeout);
- return !*timeout ? -ETIME : 1;
+ if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+ return -ETIME;
+ return 1;
}
/*
@@ -7719,7 +7721,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
- signed long timeout = MAX_SCHEDULE_TIMEOUT;
+ ktime_t timeout = KTIME_MAX;
int ret;
do {
@@ -7735,7 +7737,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (get_timespec64(&ts, uts))
return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
+ timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
if (sig) {
@@ -7767,7 +7769,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -7924,7 +7926,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
- break;
+ if (atomic_read(&data->refs) > 0) {
+ /*
+ * it has been revived by another thread while
+ * we were unlocked
+ */
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ break;
+ }
}
atomic_inc(&data->refs);
diff --git a/fs/namespace.c b/fs/namespace.c
index 40b994a29e90..de6fae84f1a1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -469,6 +469,24 @@ void mnt_drop_write_file(struct file *file)
}
EXPORT_SYMBOL(mnt_drop_write_file);
+/**
+ * mnt_hold_writers - prevent write access to the given mount
+ * @mnt: mnt to prevent write access to
+ *
+ * Prevents write access to @mnt if there are no active writers for @mnt.
+ * This function needs to be called and return successfully before changing
+ * properties of @mnt that need to remain stable for callers with write access
+ * to @mnt.
+ *
+ * After this functions has been called successfully callers must pair it with
+ * a call to mnt_unhold_writers() in order to stop preventing write access to
+ * @mnt.
+ *
+ * Context: This function expects lock_mount_hash() to be held serializing
+ * setting MNT_WRITE_HOLD.
+ * Return: On success 0 is returned.
+ * On error, -EBUSY is returned.
+ */
static inline int mnt_hold_writers(struct mount *mnt)
{
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -500,6 +518,18 @@ static inline int mnt_hold_writers(struct mount *mnt)
return 0;
}
+/**
+ * mnt_unhold_writers - stop preventing write access to the given mount
+ * @mnt: mnt to stop preventing write access to
+ *
+ * Stop preventing write access to @mnt allowing callers to gain write access
+ * to @mnt again.
+ *
+ * This function can only be called after a successful call to
+ * mnt_hold_writers().
+ *
+ * Context: This function expects lock_mount_hash() to be held.
+ */
static inline void mnt_unhold_writers(struct mount *mnt)
{
/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7bc7cf6b26f0..75cb1cbe4cde 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2010,14 +2010,14 @@ no_open:
if (!res) {
inode = d_inode(dentry);
if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
- !S_ISDIR(inode->i_mode))
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
res = ERR_PTR(-ENOTDIR);
else if (inode && S_ISREG(inode->i_mode))
res = ERR_PTR(-EOPENSTALE);
} else if (!IS_ERR(res)) {
inode = d_inode(res);
if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
- !S_ISDIR(inode->i_mode)) {
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) {
dput(res);
res = ERR_PTR(-ENOTDIR);
} else if (inode && S_ISREG(inode->i_mode)) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a918c3a834b6..d96baa4450e3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -853,12 +853,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
}
/* Flush out writes to the server in order to update c/mtime. */
- if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
- S_ISREG(inode->i_mode)) {
- err = filemap_write_and_wait(inode->i_mapping);
- if (err)
- goto out;
- }
+ if ((request_mask & (STATX_CTIME | STATX_MTIME)) &&
+ S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
/*
* We may force a getattr if the user cares about atime.
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f5020828ab65..0e0db6c27619 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1229,8 +1229,7 @@ nfs4_update_changeattr_locked(struct inode *inode,
NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
- NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR |
- NFS_INO_REVAL_PAGECACHE;
+ NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR;
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
}
nfsi->attrtimeo_timestamp = jiffies;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index bafc02bf8220..de7252715b12 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
if (!gid_valid(gid))
return -EINVAL;
opts->gid = gid;
- set_gid(tracefs_mount->mnt_root, gid);
break;
case Opt_mode:
if (match_octal(&args[0], &option))
@@ -291,7 +290,9 @@ static int tracefs_apply_options(struct super_block *sb)
inode->i_mode |= opts->mode;
inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
return 0;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4c0dee78b2f8..d84714e4e46a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1753,6 +1753,11 @@ xfs_remount_ro(
};
int error;
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
+
/*
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
@@ -1831,8 +1836,6 @@ xfs_fs_reconfigure(
if (error)
return error;
- sync_filesystem(mp->m_super);
-
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;