summaryrefslogtreecommitdiff
path: root/fs/libfs.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 22:00:53 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 22:00:53 +0300
commit7e587c20adab5b8da4c7b5573e711a8c808e0a2d (patch)
tree29c8f36dacf1c5ef6faaa3e4e84ef92a259060dc /fs/libfs.c
parent100ceb4817a2ac650e29f107cf97161ce3e2289a (diff)
parenta0634b457eca16b21a4525bc40cd2db80f52dadc (diff)
downloadlinux-7e587c20adab5b8da4c7b5573e711a8c808e0a2d.tar.xz
Merge tag 'vfs-6.14-rc1.libfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs libfs updates from Christian Brauner: "This improves the stable directory offset behavior in various ways. Stable offsets are needed so that NFS can reliably read directories on filesystems such as tmpfs: - Improve the end-of-directory detection According to getdents(3), the d_off field in each returned directory entry points to the next entry in the directory. The d_off field in the last returned entry in the readdir buffer must contain a valid offset value, but if it points to an actual directory entry, then readdir/getdents can loop. Introduce a specific fixed offset value that is placed in the d_off field of the last entry in a directory. Some user space applications assume that the EOD offset value is larger than the offsets of real directory entries, so the largest valid offset value is reserved for this purpose. This new value is never allocated by simple_offset_add(). When ->iterate_dir() returns, getdents{64} inserts the ctx->pos value into the d_off field of the last valid entry in the readdir buffer. When it hits EOD, offset_readdir() sets ctx->pos to the EOD offset value so the last entry is updated to point to the EOD marker. When trying to read the entry at the EOD offset, offset_readdir() terminates immediately. - Rely on d_children to iterate stable offset directories Instead of using the mtree to emit entries in the order of their offset values, use it only to map incoming ctx->pos to a starting entry. Then use the directory's d_children list, which is already maintained properly by the dcache, to find the next child to emit. - Narrow the range of directory offset values returned by simple_offset_add() to 3 .. (S32_MAX - 1) on all platforms. This means the allocation behavior is identical on 32-bit systems, 64-bit systems, and 32-bit user space on 64-bit kernels. The new range still permits over 2 billion concurrent entries per directory. - Return ENOSPC when the directory offset range is exhausted. Hitting this error is almost impossible though. - Remove the simple_offset_empty() helper" * tag 'vfs-6.14-rc1.libfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: libfs: Use d_children list to iterate simple_offset directories libfs: Replace simple_offset end-of-directory detection Revert "libfs: fix infinite directory reads for offset dir" Revert "libfs: Add simple_offset_empty()" libfs: Return ENOSPC when the directory offset range is exhausted
Diffstat (limited to 'fs/libfs.c')
-rw-r--r--fs/libfs.c162
1 files changed, 77 insertions, 85 deletions
diff --git a/fs/libfs.c b/fs/libfs.c
index 2890a9c4a414..5b6120b19e99 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -245,9 +245,16 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
-/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+/* simple_offset_add() never assigns these to a dentry */
enum {
- DIR_OFFSET_MIN = 2,
+ DIR_OFFSET_FIRST = 2, /* Find first real entry */
+ DIR_OFFSET_EOD = S32_MAX,
+};
+
+/* simple_offset_add() allocation range */
+enum {
+ DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1,
+ DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1,
};
static void offset_set(struct dentry *dentry, long offset)
@@ -291,9 +298,10 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
return -EBUSY;
ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
- LONG_MAX, &octx->next_offset, GFP_KERNEL);
- if (ret < 0)
- return ret;
+ DIR_OFFSET_MAX, &octx->next_offset,
+ GFP_KERNEL);
+ if (unlikely(ret < 0))
+ return ret == -EBUSY ? -ENOSPC : ret;
offset_set(dentry, offset);
return 0;
@@ -330,38 +338,6 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
}
/**
- * simple_offset_empty - Check if a dentry can be unlinked
- * @dentry: dentry to be tested
- *
- * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
- */
-int simple_offset_empty(struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- struct offset_ctx *octx;
- struct dentry *child;
- unsigned long index;
- int ret = 1;
-
- if (!inode || !S_ISDIR(inode->i_mode))
- return ret;
-
- index = DIR_OFFSET_MIN;
- octx = inode->i_op->get_offset_ctx(inode);
- mt_for_each(&octx->mt, child, index, LONG_MAX) {
- spin_lock(&child->d_lock);
- if (simple_positive(child)) {
- spin_unlock(&child->d_lock);
- ret = 0;
- break;
- }
- spin_unlock(&child->d_lock);
- }
-
- return ret;
-}
-
-/**
* simple_offset_rename - handle directory offsets for rename
* @old_dir: parent directory of source entry
* @old_dentry: dentry of source entry
@@ -454,14 +430,6 @@ void simple_offset_destroy(struct offset_ctx *octx)
mtree_destroy(&octx->mt);
}
-static int offset_dir_open(struct inode *inode, struct file *file)
-{
- struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
-
- file->private_data = (void *)ctx->next_offset;
- return 0;
-}
-
/**
* offset_dir_llseek - Advance the read position of a directory descriptor
* @file: an open directory whose position is to be updated
@@ -475,9 +443,6 @@ static int offset_dir_open(struct inode *inode, struct file *file)
*/
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_inode;
- struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
-
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
@@ -490,62 +455,89 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
- /* In this case, ->private_data is protected by f_pos_lock */
- if (!offset)
- file->private_data = (void *)ctx->next_offset;
return vfs_setpos(file, offset, LONG_MAX);
}
-static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
+static struct dentry *find_positive_dentry(struct dentry *parent,
+ struct dentry *dentry,
+ bool next)
{
- MA_STATE(mas, &octx->mt, offset, offset);
+ struct dentry *found = NULL;
+
+ spin_lock(&parent->d_lock);
+ if (next)
+ dentry = d_next_sibling(dentry);
+ else if (!dentry)
+ dentry = d_first_child(parent);
+ hlist_for_each_entry_from(dentry, d_sib) {
+ if (!simple_positive(dentry))
+ continue;
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(dentry))
+ found = dget_dlock(dentry);
+ spin_unlock(&dentry->d_lock);
+ if (likely(found))
+ break;
+ }
+ spin_unlock(&parent->d_lock);
+ return found;
+}
+
+static noinline_for_stack struct dentry *
+offset_dir_lookup(struct dentry *parent, loff_t offset)
+{
+ struct inode *inode = d_inode(parent);
+ struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
struct dentry *child, *found = NULL;
- rcu_read_lock();
- child = mas_find(&mas, LONG_MAX);
- if (!child)
- goto out;
- spin_lock(&child->d_lock);
- if (simple_positive(child))
- found = dget_dlock(child);
- spin_unlock(&child->d_lock);
-out:
- rcu_read_unlock();
+ MA_STATE(mas, &octx->mt, offset, offset);
+
+ if (offset == DIR_OFFSET_FIRST)
+ found = find_positive_dentry(parent, NULL, false);
+ else {
+ rcu_read_lock();
+ child = mas_find(&mas, DIR_OFFSET_MAX);
+ found = find_positive_dentry(parent, child, false);
+ rcu_read_unlock();
+ }
return found;
}
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- long offset = dentry2offset(dentry);
- return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
- inode->i_ino, fs_umode_to_dtype(inode->i_mode));
+ return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
+ inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}
-static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
+static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
{
- struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
+ struct dentry *dir = file->f_path.dentry;
struct dentry *dentry;
+ dentry = offset_dir_lookup(dir, ctx->pos);
+ if (!dentry)
+ goto out_eod;
while (true) {
- dentry = offset_find_next(octx, ctx->pos);
- if (!dentry)
- return;
-
- if (dentry2offset(dentry) >= last_index) {
- dput(dentry);
- return;
- }
+ struct dentry *next;
- if (!offset_dir_emit(ctx, dentry)) {
- dput(dentry);
- return;
- }
+ ctx->pos = dentry2offset(dentry);
+ if (!offset_dir_emit(ctx, dentry))
+ break;
- ctx->pos = dentry2offset(dentry) + 1;
+ next = find_positive_dentry(dir, dentry, true);
dput(dentry);
+
+ if (!next)
+ goto out_eod;
+ dentry = next;
}
+ dput(dentry);
+ return;
+
+out_eod:
+ ctx->pos = DIR_OFFSET_EOD;
}
/**
@@ -565,6 +557,8 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon
*
* On return, @ctx->pos contains an offset that will read the next entry
* in this directory when offset_readdir() is called again with @ctx.
+ * Caller places this value in the d_off field of the last entry in the
+ * user's buffer.
*
* Return values:
* %0 - Complete
@@ -572,19 +566,17 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dir = file->f_path.dentry;
- long last_index = (long)file->private_data;
lockdep_assert_held(&d_inode(dir)->i_rwsem);
if (!dir_emit_dots(file, ctx))
return 0;
-
- offset_iterate_dir(d_inode(dir), ctx, last_index);
+ if (ctx->pos != DIR_OFFSET_EOD)
+ offset_iterate_dir(file, ctx);
return 0;
}
const struct file_operations simple_offset_dir_operations = {
- .open = offset_dir_open,
.llseek = offset_dir_llseek,
.iterate_shared = offset_readdir,
.read = generic_read_dir,