summaryrefslogtreecommitdiff
path: root/fs/orangefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/orangefs')
-rw-r--r--fs/orangefs/dir.c528
-rw-r--r--fs/orangefs/downcall.h16
-rw-r--r--fs/orangefs/orangefs-dev-proto.h7
3 files changed, 197 insertions, 354 deletions
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 284373a57a08..cf0ebb06b84e 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -1,7 +1,5 @@
/*
- * (C) 2001 Clemson University and The University of Chicago
- *
- * See COPYING in top-level directory.
+ * Copyright 2017 Omnibond Systems, L.L.C.
*/
#include "protocol.h"
@@ -9,388 +7,244 @@
#include "orangefs-bufmap.h"
/*
- * decode routine used by kmod to deal with the blob sent from
- * userspace for readdirs. The blob contains zero or more of these
- * sub-blobs:
- * __u32 - represents length of the character string that follows.
- * string - between 1 and ORANGEFS_NAME_MAX bytes long.
- * padding - (if needed) to cause the __u32 plus the string to be
- * eight byte aligned.
- * khandle - sizeof(khandle) bytes.
+ * There can be up to 512 directory entries. Each entry is encoded as
+ * follows:
+ * 4 bytes: string size (n)
+ * n bytes: string
+ * 1 byte: trailing zero
+ * padding to 8 bytes
+ * 16 bytes: khandle
+ * padding to 8 bytes
*/
-static long decode_dirents(char *ptr, size_t size,
- struct orangefs_readdir_response_s *readdir)
-{
- int i;
- struct orangefs_readdir_response_s *rd =
- (struct orangefs_readdir_response_s *) ptr;
- char *buf = ptr;
- int khandle_size = sizeof(struct orangefs_khandle);
- size_t offset = offsetof(struct orangefs_readdir_response_s,
- dirent_array);
- /* 8 reflects eight byte alignment */
- int smallest_blob = khandle_size + 8;
- __u32 len;
- int aligned_len;
- int sizeof_u32 = sizeof(__u32);
- long ret;
-
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
+#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512)
- /* size is = offset on empty dirs, > offset on non-empty dirs... */
- if (size < offset) {
- gossip_err("%s: size:%zu: offset:%zu:\n",
- __func__,
- size,
- offset);
- ret = -EINVAL;
- goto out;
- }
-
- if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
- gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
- __func__,
- size,
- readdir->orangefs_dirent_outcount);
- ret = -EINVAL;
- goto out;
- }
-
- readdir->token = rd->token;
- readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
- readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
- sizeof(*readdir->dirent_array),
- GFP_KERNEL);
- if (readdir->dirent_array == NULL) {
- gossip_err("%s: kcalloc failed.\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
-
- buf += offset;
- size -= offset;
-
- for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
- if (size < smallest_blob) {
- gossip_err("%s: size:%zu: smallest_blob:%d:\n",
- __func__,
- size,
- smallest_blob);
- ret = -EINVAL;
- goto free;
- }
-
- len = *(__u32 *)buf;
- if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
- gossip_err("%s: len:%d:\n", __func__, len);
- ret = -EINVAL;
- goto free;
- }
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: size:%zu: len:%d:\n",
- __func__,
- size,
- len);
-
- readdir->dirent_array[i].d_name = buf + sizeof_u32;
- readdir->dirent_array[i].d_length = len;
-
- /*
- * Calculate "aligned" length of this string and its
- * associated __u32 descriptor.
- */
- aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: aligned_len:%d:\n",
- __func__,
- aligned_len);
-
- /*
- * The end of the blob should coincide with the end
- * of the last sub-blob.
- */
- if (size < aligned_len + khandle_size) {
- gossip_err("%s: ran off the end of the blob.\n",
- __func__);
- ret = -EINVAL;
- goto free;
- }
- size -= aligned_len + khandle_size;
-
- buf += aligned_len;
-
- readdir->dirent_array[i].khandle =
- *(struct orangefs_khandle *) buf;
- buf += khandle_size;
- }
- ret = buf - ptr;
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
- goto out;
-
-free:
- kfree(readdir->dirent_array);
- readdir->dirent_array = NULL;
-
-out:
- return ret;
-}
+struct orangefs_dir {
+ __u64 token;
+ void *directory;
+ size_t i, len;
+ int error;
+};
/*
- * Read directory entries from an instance of an open directory.
+ * The userspace component sends several directory entries of the
+ * following format. The first four bytes are the string length not
+ * including a trailing zero byte. This is followed by the string and a
+ * trailing zero padded to the next four byte boundry. This is followed
+ * by the sixteen byte khandle padded to the next eight byte boundry.
+ *
+ * The trailer_buf starts with a struct orangefs_readdir_response_s
+ * which must be skipped to get to the directory data.
*/
-static int orangefs_readdir(struct file *file, struct dir_context *ctx)
-{
- int ret = 0;
- int buffer_index;
- /*
- * ptoken supports Orangefs' distributed directory logic, added
- * in 2.9.2.
- */
- __u64 *ptoken = file->private_data;
- __u64 pos = 0;
- ino_t ino = 0;
- struct dentry *dentry = file->f_path.dentry;
- struct orangefs_kernel_op_s *new_op = NULL;
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
- struct orangefs_readdir_response_s readdir_response;
- void *dents_buf;
- int i = 0;
- int len = 0;
- ino_t current_ino = 0;
- char *current_entry = NULL;
- long bytes_decoded;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld, ptoken = %llu\n",
- __func__,
- lld(ctx->pos),
- llu(*ptoken));
-
- pos = (__u64) ctx->pos;
-
- /* are we done? */
- if (pos == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Skipping to termination path\n");
- return 0;
- }
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "orangefs_readdir called on %pd (pos=%llu)\n",
- dentry, llu(pos));
-
- memset(&readdir_response, 0, sizeof(readdir_response));
- new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
- if (!new_op)
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry)
+{
+ const size_t offset =
+ sizeof(struct orangefs_readdir_response_s);
+ struct orangefs_readdir_response_s *resp;
+ struct orangefs_kernel_op_s *op;
+ int bufi, r;
+
+ op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!op) {
+ od->error = -ENOMEM;
return -ENOMEM;
+ }
/*
- * Only the indices are shared. No memory is actually shared, but the
- * mechanism is used.
+ * Despite the badly named field, readdir does not use shared
+ * memory. However, there are a limited number of readdir
+ * slots, which must be allocated here. This flag simply tells
+ * the op scheduler to return the op here for retry.
*/
- new_op->uses_shared_memory = 1;
- new_op->upcall.req.readdir.refn = orangefs_inode->refn;
- new_op->upcall.req.readdir.max_dirent_count =
+ op->uses_shared_memory = 1;
+ op->upcall.req.readdir.refn = oi->refn;
+ op->upcall.req.readdir.token = od->token;
+ op->upcall.req.readdir.max_dirent_count =
ORANGEFS_MAX_DIRENT_COUNT_READDIR;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: upcall.req.readdir.refn.khandle: %pU\n",
- __func__,
- &new_op->upcall.req.readdir.refn.khandle);
-
- new_op->upcall.req.readdir.token = *ptoken;
-
-get_new_buffer_index:
- buffer_index = orangefs_readdir_index_get();
- if (buffer_index < 0) {
- ret = buffer_index;
- gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
- ret);
- goto out_free_op;
+again:
+ bufi = orangefs_readdir_index_get();
+ if (bufi < 0) {
+ op_release(op);
+ od->error = bufi;
+ return bufi;
}
- new_op->upcall.req.readdir.buf_index = buffer_index;
- ret = service_operation(new_op,
- "orangefs_readdir",
- get_interruptible_flag(dentry->d_inode));
+ op->upcall.req.readdir.buf_index = bufi;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir downcall status is %d. ret:%d\n",
- new_op->downcall.status,
- ret);
+ r = service_operation(op, "orangefs_readdir",
+ get_interruptible_flag(dentry->d_inode));
- orangefs_readdir_index_put(buffer_index);
+ orangefs_readdir_index_put(bufi);
- if (ret == -EAGAIN && op_state_purged(new_op)) {
- /* Client-core indices are invalid after it restarted. */
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: Getting new buffer_index for retry of readdir..\n",
- __func__);
- goto get_new_buffer_index;
- }
-
- if (ret == -EIO && op_state_purged(new_op)) {
- gossip_err("%s: Client is down. Aborting readdir call.\n",
- __func__);
- goto out_free_op;
+ if (op_state_purged(op)) {
+ if (r == -EAGAIN) {
+ vfree(op->downcall.trailer_buf);
+ goto again;
+ } else if (r == -EIO) {
+ vfree(op->downcall.trailer_buf);
+ op_release(op);
+ od->error = r;
+ return r;
+ }
}
- if (ret < 0 || new_op->downcall.status != 0) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir request failed. Status:%d\n",
- new_op->downcall.status);
- if (ret >= 0)
- ret = new_op->downcall.status;
- goto out_free_op;
- }
+ if (r < 0) {
+ vfree(op->downcall.trailer_buf);
+ op_release(op);
+ od->error = r;
+ return r;
+ } else if (op->downcall.status) {
+ vfree(op->downcall.trailer_buf);
+ op_release(op);
+ od->error = op->downcall.status;
+ return op->downcall.status;
+ }
+
+ resp = (struct orangefs_readdir_response_s *)
+ op->downcall.trailer_buf;
+ od->token = resp->token;
+
+ if (od->len + op->downcall.trailer_size - offset <=
+ MAX_DIRECTORY) {
+ memcpy(od->directory + od->len,
+ op->downcall.trailer_buf + offset,
+ op->downcall.trailer_size - offset);
+ od->len += op->downcall.trailer_size - offset;
+ } else {
+ /* This limit was chosen based on protocol limits. */
+ gossip_err("orangefs_dir_more: userspace sent too much data\n");
+ vfree(op->downcall.trailer_buf);
+ op_release(op);
+ od->error = -EIO;
+ return -EIO;
+ }
+
+ vfree(op->downcall.trailer_buf);
+ op_release(op);
+ return 0;
+}
- dents_buf = new_op->downcall.trailer_buf;
- if (dents_buf == NULL) {
- gossip_err("Invalid NULL buffer in readdir response\n");
- ret = -ENOMEM;
- goto out_free_op;
- }
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct dir_context *ctx)
+{
+ struct orangefs_khandle *khandle;
+ __u32 *len, padlen;
+ char *s;
+ while (od->i < od->len) {
+ if (od->len < od->i + sizeof *len)
+ goto eio;
+ len = od->directory + od->i;
+ /*
+ * len is the size of the string itself. padlen is the
+ * total size of the encoded string.
+ */
+ padlen = (sizeof *len + *len + 1) +
+ (4 - (sizeof *len + *len + 1)%8)%8;
+ if (od->len < od->i + padlen + sizeof *khandle)
+ goto eio;
+ s = od->directory + od->i + sizeof *len;
+ if (s[*len] != 0)
+ goto eio;
+ khandle = od->directory + od->i + padlen;
+
+ if (!dir_emit(ctx, s, *len,
+ orangefs_khandle_to_ino(khandle), DT_UNKNOWN))
+ return 0;
+ od->i += padlen + sizeof *khandle;
+ od->i = od->i + (8 - od->i%8)%8;
+ ctx->pos = 2 + od->i;
+ }
+ BUG_ON(od->i > od->len);
+ return 0;
+eio:
+ gossip_err("orangefs_dir_fill: userspace returns corrupt data\n");
+ od->error = -EIO;
+ return -EIO;
+}
- bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
- &readdir_response);
- if (bytes_decoded < 0) {
- ret = bytes_decoded;
- gossip_err("Could not decode readdir from buffer %d\n", ret);
- goto out_vfree;
- }
+static int orangefs_dir_iterate(struct file *file,
+ struct dir_context *ctx)
+{
+ struct orangefs_inode_s *oi;
+ struct orangefs_dir *od;
+ struct dentry *dentry;
+ int r;
- if (bytes_decoded != new_op->downcall.trailer_size) {
- gossip_err("orangefs_readdir: # bytes decoded (%ld) "
- "!= trailer size (%ld)\n",
- bytes_decoded,
- (long)new_op->downcall.trailer_size);
- ret = -EINVAL;
- goto out_destroy_handle;
- }
+ dentry = file->f_path.dentry;
+ oi = ORANGEFS_I(dentry->d_inode);
+ od = file->private_data;
- /*
- * orangefs doesn't actually store dot and dot-dot, but
- * we need to have them represented.
- */
- if (pos == 0) {
- ino = get_ino_from_khandle(dentry->d_inode);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \".\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
- pos += 1;
- }
+ if (od->error)
+ return od->error;
- if (pos == 1) {
- ino = get_parent_ino_from_dentry(dentry);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \"..\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
- pos += 1;
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos++;
}
-
- /*
- * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
- * to prevent "finding" dot and dot-dot on any iteration
- * other than the first.
- */
- if (ctx->pos == ORANGEFS_ITERATE_NEXT)
- ctx->pos = 0;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: dirent_outcount:%d:\n",
- __func__,
- readdir_response.orangefs_dirent_outcount);
- for (i = ctx->pos;
- i < readdir_response.orangefs_dirent_outcount;
- i++) {
- len = readdir_response.dirent_array[i].d_length;
- current_entry = readdir_response.dirent_array[i].d_name;
- current_ino = orangefs_khandle_to_ino(
- &readdir_response.dirent_array[i].khandle);
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "calling dir_emit for %s with len %d"
- ", ctx->pos %ld\n",
- current_entry,
- len,
- (unsigned long)ctx->pos);
- /*
- * type is unknown. We don't return object type
- * in the dirent_array. This leaves getdents
- * clueless about type.
- */
- ret =
- dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
- if (!ret)
- break;
+ if (ctx->pos == 1) {
+ if (!dir_emit_dotdot(file, ctx))
+ return 0;
ctx->pos++;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld\n",
- __func__,
- lld(ctx->pos));
-
}
- /*
- * we ran all the way through the last batch, set up for
- * getting another batch...
- */
- if (ret) {
- *ptoken = readdir_response.token;
- ctx->pos = ORANGEFS_ITERATE_NEXT;
+ r = 0;
+
+ if (od->i < od->len) {
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
+ if (r)
+ return r;
}
- /*
- * Did we hit the end of the directory?
- */
- if (readdir_response.token == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
- ctx->pos = ORANGEFS_READDIR_END;
+ if (od->token != ORANGEFS_READDIR_END) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
}
-out_destroy_handle:
- /* kfree(NULL) is safe */
- kfree(readdir_response.dirent_array);
-out_vfree:
- gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
- vfree(dents_buf);
-out_free_op:
- op_release(new_op);
- gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
- return ret;
+ return r;
}
static int orangefs_dir_open(struct inode *inode, struct file *file)
{
- __u64 *ptoken;
-
- file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+ struct orangefs_dir *od;
+ file->private_data = kmalloc(sizeof(struct orangefs_dir),
+ GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
-
- ptoken = file->private_data;
- *ptoken = ORANGEFS_READDIR_START;
+ od = file->private_data;
+ od->token = ORANGEFS_READDIR_START;
+ /*
+ * XXX: It seems wasteful to allocate such a large buffer for
+ * each request. Most will be much smaller.
+ */
+ od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL);
+ if (!od->directory) {
+ kfree(file->private_data);
+ return -ENOMEM;
+ }
+ od->i = 0;
+ od->len = 0;
+ od->error = 0;
return 0;
}
static int orangefs_dir_release(struct inode *inode, struct file *file)
{
+ struct orangefs_dir *od = file->private_data;
orangefs_flush_inode(inode);
- kfree(file->private_data);
+ free_pages_exact(od->directory, MAX_DIRECTORY);
+ kfree(od);
return 0;
}
-/** ORANGEFS implementation of VFS directory operations */
const struct file_operations orangefs_dir_operations = {
.read = generic_read_dir,
- .iterate = orangefs_readdir,
+ .iterate = orangefs_dir_iterate,
.open = orangefs_dir_open,
- .release = orangefs_dir_release,
+ .release = orangefs_dir_release
};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 3b8923f8bf21..163001c95501 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -40,16 +40,6 @@ struct orangefs_mkdir_response {
struct orangefs_object_kref refn;
};
-/*
- * duplication of some system interface structures so that I don't have
- * to allocate extra memory
- */
-struct orangefs_dirent {
- char *d_name;
- int d_length;
- struct orangefs_khandle khandle;
-};
-
struct orangefs_statfs_response {
__s64 block_size;
__s64 blocks_total;
@@ -131,12 +121,16 @@ struct orangefs_downcall_s {
} resp;
};
+/*
+ * The readdir response comes in the trailer. It is followed by the
+ * directory entries as described in dir.c.
+ */
+
struct orangefs_readdir_response_s {
__u64 token;
__u64 directory_version;
__u32 __pad2;
__u32 orangefs_dirent_outcount;
- struct orangefs_dirent *dirent_array;
};
#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index f380f9ed1b28..efe08c763e56 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -52,12 +52,7 @@
*/
#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800
-/*
- * The maximum number of directory entries in a single request is 96.
- * XXX: Why can this not be higher. The client-side code can handle up to 512.
- * XXX: What happens if we expect more than the client can return?
- */
-#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512
#include "upcall.h"
#include "downcall.h"