diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-20 22:00:53 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-20 22:00:53 +0300 |
commit | 7e587c20adab5b8da4c7b5573e711a8c808e0a2d (patch) | |
tree | 29c8f36dacf1c5ef6faaa3e4e84ef92a259060dc /fs/libfs.c | |
parent | 100ceb4817a2ac650e29f107cf97161ce3e2289a (diff) | |
parent | a0634b457eca16b21a4525bc40cd2db80f52dadc (diff) | |
download | linux-7e587c20adab5b8da4c7b5573e711a8c808e0a2d.tar.xz |
Merge tag 'vfs-6.14-rc1.libfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs libfs updates from Christian Brauner:
"This improves the stable directory offset behavior in various ways.
Stable offsets are needed so that NFS can reliably read directories on
filesystems such as tmpfs:
- Improve the end-of-directory detection
According to getdents(3), the d_off field in each returned
directory entry points to the next entry in the directory. The
d_off field in the last returned entry in the readdir buffer must
contain a valid offset value, but if it points to an actual
directory entry, then readdir/getdents can loop.
Introduce a specific fixed offset value that is placed in the d_off
field of the last entry in a directory. Some user space
applications assume that the EOD offset value is larger than the
offsets of real directory entries, so the largest valid offset
value is reserved for this purpose. This new value is never
allocated by simple_offset_add().
When ->iterate_dir() returns, getdents{64} inserts the ctx->pos
value into the d_off field of the last valid entry in the readdir
buffer. When it hits EOD, offset_readdir() sets ctx->pos to the EOD
offset value so the last entry is updated to point to the EOD
marker.
When trying to read the entry at the EOD offset, offset_readdir()
terminates immediately.
- Rely on d_children to iterate stable offset directories
Instead of using the mtree to emit entries in the order of their
offset values, use it only to map incoming ctx->pos to a starting
entry. Then use the directory's d_children list, which is already
maintained properly by the dcache, to find the next child to emit.
- Narrow the range of directory offset values returned by
simple_offset_add() to 3 .. (S32_MAX - 1) on all platforms. This
means the allocation behavior is identical on 32-bit systems,
64-bit systems, and 32-bit user space on 64-bit kernels. The new
range still permits over 2 billion concurrent entries per
directory.
- Return ENOSPC when the directory offset range is exhausted. Hitting
this error is almost impossible though.
- Remove the simple_offset_empty() helper"
* tag 'vfs-6.14-rc1.libfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
libfs: Use d_children list to iterate simple_offset directories
libfs: Replace simple_offset end-of-directory detection
Revert "libfs: fix infinite directory reads for offset dir"
Revert "libfs: Add simple_offset_empty()"
libfs: Return ENOSPC when the directory offset range is exhausted
Diffstat (limited to 'fs/libfs.c')
-rw-r--r-- | fs/libfs.c | 162 |
1 files changed, 77 insertions, 85 deletions
diff --git a/fs/libfs.c b/fs/libfs.c index 2890a9c4a414..5b6120b19e99 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -245,9 +245,16 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); -/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +/* simple_offset_add() never assigns these to a dentry */ enum { - DIR_OFFSET_MIN = 2, + DIR_OFFSET_FIRST = 2, /* Find first real entry */ + DIR_OFFSET_EOD = S32_MAX, +}; + +/* simple_offset_add() allocation range */ +enum { + DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, + DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) @@ -291,9 +298,10 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, - LONG_MAX, &octx->next_offset, GFP_KERNEL); - if (ret < 0) - return ret; + DIR_OFFSET_MAX, &octx->next_offset, + GFP_KERNEL); + if (unlikely(ret < 0)) + return ret == -EBUSY ? -ENOSPC : ret; offset_set(dentry, offset); return 0; @@ -330,38 +338,6 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) } /** - * simple_offset_empty - Check if a dentry can be unlinked - * @dentry: dentry to be tested - * - * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. - */ -int simple_offset_empty(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct offset_ctx *octx; - struct dentry *child; - unsigned long index; - int ret = 1; - - if (!inode || !S_ISDIR(inode->i_mode)) - return ret; - - index = DIR_OFFSET_MIN; - octx = inode->i_op->get_offset_ctx(inode); - mt_for_each(&octx->mt, child, index, LONG_MAX) { - spin_lock(&child->d_lock); - if (simple_positive(child)) { - spin_unlock(&child->d_lock); - ret = 0; - break; - } - spin_unlock(&child->d_lock); - } - - return ret; -} - -/** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry * @old_dentry: dentry of source entry @@ -454,14 +430,6 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } -static int offset_dir_open(struct inode *inode, struct file *file) -{ - struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); - - file->private_data = (void *)ctx->next_offset; - return 0; -} - /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -475,9 +443,6 @@ static int offset_dir_open(struct inode *inode, struct file *file) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_inode; - struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); - switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -490,62 +455,89 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } - /* In this case, ->private_data is protected by f_pos_lock */ - if (!offset) - file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); } -static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) +static struct dentry *find_positive_dentry(struct dentry *parent, + struct dentry *dentry, + bool next) { - MA_STATE(mas, &octx->mt, offset, offset); + struct dentry *found = NULL; + + spin_lock(&parent->d_lock); + if (next) + dentry = d_next_sibling(dentry); + else if (!dentry) + dentry = d_first_child(parent); + hlist_for_each_entry_from(dentry, d_sib) { + if (!simple_positive(dentry)) + continue; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(dentry)) + found = dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + if (likely(found)) + break; + } + spin_unlock(&parent->d_lock); + return found; +} + +static noinline_for_stack struct dentry * +offset_dir_lookup(struct dentry *parent, loff_t offset) +{ + struct inode *inode = d_inode(parent); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found = NULL; - rcu_read_lock(); - child = mas_find(&mas, LONG_MAX); - if (!child) - goto out; - spin_lock(&child->d_lock); - if (simple_positive(child)) - found = dget_dlock(child); - spin_unlock(&child->d_lock); -out: - rcu_read_unlock(); + MA_STATE(mas, &octx->mt, offset, offset); + + if (offset == DIR_OFFSET_FIRST) + found = find_positive_dentry(parent, NULL, false); + else { + rcu_read_lock(); + child = mas_find(&mas, DIR_OFFSET_MAX); + found = find_positive_dentry(parent, child, false); + rcu_read_unlock(); + } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - long offset = dentry2offset(dentry); - return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, - inode->i_ino, fs_umode_to_dtype(inode->i_mode)); + return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) +static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { - struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); + struct dentry *dir = file->f_path.dentry; struct dentry *dentry; + dentry = offset_dir_lookup(dir, ctx->pos); + if (!dentry) + goto out_eod; while (true) { - dentry = offset_find_next(octx, ctx->pos); - if (!dentry) - return; - - if (dentry2offset(dentry) >= last_index) { - dput(dentry); - return; - } + struct dentry *next; - if (!offset_dir_emit(ctx, dentry)) { - dput(dentry); - return; - } + ctx->pos = dentry2offset(dentry); + if (!offset_dir_emit(ctx, dentry)) + break; - ctx->pos = dentry2offset(dentry) + 1; + next = find_positive_dentry(dir, dentry, true); dput(dentry); + + if (!next) + goto out_eod; + dentry = next; } + dput(dentry); + return; + +out_eod: + ctx->pos = DIR_OFFSET_EOD; } /** @@ -565,6 +557,8 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. + * Caller places this value in the d_off field of the last entry in the + * user's buffer. * * Return values: * %0 - Complete @@ -572,19 +566,17 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; - long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - - offset_iterate_dir(d_inode(dir), ctx, last_index); + if (ctx->pos != DIR_OFFSET_EOD) + offset_iterate_dir(file, ctx); return 0; } const struct file_operations simple_offset_dir_operations = { - .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, |