From 418ce3eb66885e409eb3fef5c19bd037435241c5 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:37:55 -0400 Subject: orangefs: remove unused get_fsid_from_ino Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-kernel.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 8afac46fcc87..d9b050bc8882 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -340,11 +340,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode) return &(ORANGEFS_I(inode)->refn.khandle); } -static inline __s32 get_fsid_from_ino(struct inode *inode) -{ - return ORANGEFS_I(inode)->refn.fs_id; -} - static inline ino_t get_ino_from_khandle(struct inode *inode) { struct orangefs_khandle *khandle; -- cgit v1.2.3 From a956af337b9ff25822d9ce1a59c6ed0c09fc14b9 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:37:56 -0400 Subject: orangefs: fix bounds check for listxattr Signed-off-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Mike Marshall --- fs/orangefs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74a81b1daaac..fba4db7d0512 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -358,7 +358,7 @@ try_again: returned_count = new_op->downcall.resp.listxattr.returned_count; if (returned_count < 0 || - returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) { + returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { gossip_err("%s: impossible value for returned_count:%d:\n", __func__, returned_count); -- cgit v1.2.3 From e675c5ec51fe2554719a7b6bcdbef0a770f2c19b Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:37:57 -0400 Subject: orangefs: clean up oversize xattr validation Also don't check flags as this has been validated by the VFS already. 
Fix an off-by-one error in the max size checking. Stop logging just because userspace wants to write attributes which do not fit. This and the previous commit fix xfstests generic/020. Signed-off-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Mike Marshall --- fs/orangefs/xattr.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index fba4db7d0512..237c9c04dc3b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err("Invalid key length (%d)\n", - (int)strlen(name)); + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } fsuid = from_kuid(&init_user_ns, current_fsuid()); fsgid = from_kgid(&init_user_ns, current_fsgid()); @@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, struct orangefs_kernel_op_s *new_op = NULL; int ret = -ENOMEM; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + down_write(&orangefs_inode->xattr_sem); new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); if (!new_op) @@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, "%s: name %s, buffer_size %zd\n", __func__, name, size); - if (size >= ORANGEFS_MAX_XATTR_VALUELEN || - flags < 0) { - gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", - (int)size, - flags); + if (size > ORANGEFS_MAX_XATTR_VALUELEN) + return -EINVAL; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } internal_flag = convert_to_internal_xattr_flags(flags); - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } - /* This is equivalent to a removexattr */ if (size == 0 && 
value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, -- cgit v1.2.3 From 17930b252cd6f31163c259eaa99dd8aa630fb9ba Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:37:58 -0400 Subject: orangefs: do not set getattr_time on orangefs_lookup Since orangefs_lookup calls orangefs_iget which calls orangefs_inode_getattr, getattr_time will get set. Signed-off-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Mike Marshall --- fs/orangefs/namei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index a290ff6ec756..7c315938e9c2 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -193,8 +193,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d " "Found good inode [%lu] with count [%d]\n", -- cgit v1.2.3 From 382f4581e67f57209c7aa67e39f26ba076306a2e Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:37:59 -0400 Subject: orangefs: rewrite readdir to fix several bugs In the past, readdir assumed that the user buffer will be large enough that all entries from the server will fit. If this was not true, entries would be skipped. Since it works now, request 512 entries rather than 96 per server operation. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 528 ++++++++++++++------------------------- fs/orangefs/downcall.h | 16 +- fs/orangefs/orangefs-dev-proto.h | 7 +- 3 files changed, 197 insertions(+), 354 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 284373a57a08..cf0ebb06b84e 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -1,7 +1,5 @@ /* - * (C) 2001 Clemson University and The University of Chicago - * - * See COPYING in top-level directory. + * Copyright 2017 Omnibond Systems, L.L.C. 
*/ #include "protocol.h" @@ -9,388 +7,244 @@ #include "orangefs-bufmap.h" /* - * decode routine used by kmod to deal with the blob sent from - * userspace for readdirs. The blob contains zero or more of these - * sub-blobs: - * __u32 - represents length of the character string that follows. - * string - between 1 and ORANGEFS_NAME_MAX bytes long. - * padding - (if needed) to cause the __u32 plus the string to be - * eight byte aligned. - * khandle - sizeof(khandle) bytes. + * There can be up to 512 directory entries. Each entry is encoded as + * follows: + * 4 bytes: string size (n) + * n bytes: string + * 1 byte: trailing zero + * padding to 8 bytes + * 16 bytes: khandle + * padding to 8 bytes */ -static long decode_dirents(char *ptr, size_t size, - struct orangefs_readdir_response_s *readdir) -{ - int i; - struct orangefs_readdir_response_s *rd = - (struct orangefs_readdir_response_s *) ptr; - char *buf = ptr; - int khandle_size = sizeof(struct orangefs_khandle); - size_t offset = offsetof(struct orangefs_readdir_response_s, - dirent_array); - /* 8 reflects eight byte alignment */ - int smallest_blob = khandle_size + 8; - __u32 len; - int aligned_len; - int sizeof_u32 = sizeof(__u32); - long ret; - - gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size); +#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512) - /* size is = offset on empty dirs, > offset on non-empty dirs... 
*/ - if (size < offset) { - gossip_err("%s: size:%zu: offset:%zu:\n", - __func__, - size, - offset); - ret = -EINVAL; - goto out; - } - - if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) { - gossip_err("%s: size:%zu: dirent_outcount:%d:\n", - __func__, - size, - readdir->orangefs_dirent_outcount); - ret = -EINVAL; - goto out; - } - - readdir->token = rd->token; - readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount; - readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount, - sizeof(*readdir->dirent_array), - GFP_KERNEL); - if (readdir->dirent_array == NULL) { - gossip_err("%s: kcalloc failed.\n", __func__); - ret = -ENOMEM; - goto out; - } - - buf += offset; - size -= offset; - - for (i = 0; i < readdir->orangefs_dirent_outcount; i++) { - if (size < smallest_blob) { - gossip_err("%s: size:%zu: smallest_blob:%d:\n", - __func__, - size, - smallest_blob); - ret = -EINVAL; - goto free; - } - - len = *(__u32 *)buf; - if ((len < 1) || (len > ORANGEFS_NAME_MAX)) { - gossip_err("%s: len:%d:\n", __func__, len); - ret = -EINVAL; - goto free; - } - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: size:%zu: len:%d:\n", - __func__, - size, - len); - - readdir->dirent_array[i].d_name = buf + sizeof_u32; - readdir->dirent_array[i].d_length = len; - - /* - * Calculate "aligned" length of this string and its - * associated __u32 descriptor. - */ - aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: aligned_len:%d:\n", - __func__, - aligned_len); - - /* - * The end of the blob should coincide with the end - * of the last sub-blob. 
- */ - if (size < aligned_len + khandle_size) { - gossip_err("%s: ran off the end of the blob.\n", - __func__); - ret = -EINVAL; - goto free; - } - size -= aligned_len + khandle_size; - - buf += aligned_len; - - readdir->dirent_array[i].khandle = - *(struct orangefs_khandle *) buf; - buf += khandle_size; - } - ret = buf - ptr; - gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret); - goto out; - -free: - kfree(readdir->dirent_array); - readdir->dirent_array = NULL; - -out: - return ret; -} +struct orangefs_dir { + __u64 token; + void *directory; + size_t i, len; + int error; +}; /* - * Read directory entries from an instance of an open directory. + * The userspace component sends several directory entries of the + * following format. The first four bytes are the string length not + * including a trailing zero byte. This is followed by the string and a + * trailing zero padded to the next four byte boundry. This is followed + * by the sixteen byte khandle padded to the next eight byte boundry. + * + * The trailer_buf starts with a struct orangefs_readdir_response_s + * which must be skipped to get to the directory data. */ -static int orangefs_readdir(struct file *file, struct dir_context *ctx) -{ - int ret = 0; - int buffer_index; - /* - * ptoken supports Orangefs' distributed directory logic, added - * in 2.9.2. - */ - __u64 *ptoken = file->private_data; - __u64 pos = 0; - ino_t ino = 0; - struct dentry *dentry = file->f_path.dentry; - struct orangefs_kernel_op_s *new_op = NULL; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode); - struct orangefs_readdir_response_s readdir_response; - void *dents_buf; - int i = 0; - int len = 0; - ino_t current_ino = 0; - char *current_entry = NULL; - long bytes_decoded; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld, ptoken = %llu\n", - __func__, - lld(ctx->pos), - llu(*ptoken)); - - pos = (__u64) ctx->pos; - - /* are we done? 
*/ - if (pos == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Skipping to termination path\n"); - return 0; - } - - gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %pd (pos=%llu)\n", - dentry, llu(pos)); - - memset(&readdir_response, 0, sizeof(readdir_response)); - new_op = op_alloc(ORANGEFS_VFS_OP_READDIR); - if (!new_op) +static int orangefs_dir_more(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry) +{ + const size_t offset = + sizeof(struct orangefs_readdir_response_s); + struct orangefs_readdir_response_s *resp; + struct orangefs_kernel_op_s *op; + int bufi, r; + + op = op_alloc(ORANGEFS_VFS_OP_READDIR); + if (!op) { + od->error = -ENOMEM; return -ENOMEM; + } /* - * Only the indices are shared. No memory is actually shared, but the - * mechanism is used. + * Despite the badly named field, readdir does not use shared + * memory. However, there are a limited number of readdir + * slots, which must be allocated here. This flag simply tells + * the op scheduler to return the op here for retry. 
*/ - new_op->uses_shared_memory = 1; - new_op->upcall.req.readdir.refn = orangefs_inode->refn; - new_op->upcall.req.readdir.max_dirent_count = + op->uses_shared_memory = 1; + op->upcall.req.readdir.refn = oi->refn; + op->upcall.req.readdir.token = od->token; + op->upcall.req.readdir.max_dirent_count = ORANGEFS_MAX_DIRENT_COUNT_READDIR; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: upcall.req.readdir.refn.khandle: %pU\n", - __func__, - &new_op->upcall.req.readdir.refn.khandle); - - new_op->upcall.req.readdir.token = *ptoken; - -get_new_buffer_index: - buffer_index = orangefs_readdir_index_get(); - if (buffer_index < 0) { - ret = buffer_index; - gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n", - ret); - goto out_free_op; +again: + bufi = orangefs_readdir_index_get(); + if (bufi < 0) { + op_release(op); + od->error = bufi; + return bufi; } - new_op->upcall.req.readdir.buf_index = buffer_index; - ret = service_operation(new_op, - "orangefs_readdir", - get_interruptible_flag(dentry->d_inode)); + op->upcall.req.readdir.buf_index = bufi; - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir downcall status is %d. ret:%d\n", - new_op->downcall.status, - ret); + r = service_operation(op, "orangefs_readdir", + get_interruptible_flag(dentry->d_inode)); - orangefs_readdir_index_put(buffer_index); + orangefs_readdir_index_put(bufi); - if (ret == -EAGAIN && op_state_purged(new_op)) { - /* Client-core indices are invalid after it restarted. */ - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: Getting new buffer_index for retry of readdir..\n", - __func__); - goto get_new_buffer_index; - } - - if (ret == -EIO && op_state_purged(new_op)) { - gossip_err("%s: Client is down. 
Aborting readdir call.\n", - __func__); - goto out_free_op; + if (op_state_purged(op)) { + if (r == -EAGAIN) { + vfree(op->downcall.trailer_buf); + goto again; + } else if (r == -EIO) { + vfree(op->downcall.trailer_buf); + op_release(op); + od->error = r; + return r; + } } - if (ret < 0 || new_op->downcall.status != 0) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir request failed. Status:%d\n", - new_op->downcall.status); - if (ret >= 0) - ret = new_op->downcall.status; - goto out_free_op; - } + if (r < 0) { + vfree(op->downcall.trailer_buf); + op_release(op); + od->error = r; + return r; + } else if (op->downcall.status) { + vfree(op->downcall.trailer_buf); + op_release(op); + od->error = op->downcall.status; + return op->downcall.status; + } + + resp = (struct orangefs_readdir_response_s *) + op->downcall.trailer_buf; + od->token = resp->token; + + if (od->len + op->downcall.trailer_size - offset <= + MAX_DIRECTORY) { + memcpy(od->directory + od->len, + op->downcall.trailer_buf + offset, + op->downcall.trailer_size - offset); + od->len += op->downcall.trailer_size - offset; + } else { + /* This limit was chosen based on protocol limits. */ + gossip_err("orangefs_dir_more: userspace sent too much data\n"); + vfree(op->downcall.trailer_buf); + op_release(op); + od->error = -EIO; + return -EIO; + } + + vfree(op->downcall.trailer_buf); + op_release(op); + return 0; +} - dents_buf = new_op->downcall.trailer_buf; - if (dents_buf == NULL) { - gossip_err("Invalid NULL buffer in readdir response\n"); - ret = -ENOMEM; - goto out_free_op; - } +static int orangefs_dir_fill(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct dir_context *ctx) +{ + struct orangefs_khandle *khandle; + __u32 *len, padlen; + char *s; + while (od->i < od->len) { + if (od->len < od->i + sizeof *len) + goto eio; + len = od->directory + od->i; + /* + * len is the size of the string itself. padlen is the + * total size of the encoded string. 
+ */ + padlen = (sizeof *len + *len + 1) + + (4 - (sizeof *len + *len + 1)%8)%8; + if (od->len < od->i + padlen + sizeof *khandle) + goto eio; + s = od->directory + od->i + sizeof *len; + if (s[*len] != 0) + goto eio; + khandle = od->directory + od->i + padlen; + + if (!dir_emit(ctx, s, *len, + orangefs_khandle_to_ino(khandle), DT_UNKNOWN)) + return 0; + od->i += padlen + sizeof *khandle; + od->i = od->i + (8 - od->i%8)%8; + ctx->pos = 2 + od->i; + } + BUG_ON(od->i > od->len); + return 0; +eio: + gossip_err("orangefs_dir_fill: userspace returns corrupt data\n"); + od->error = -EIO; + return -EIO; +} - bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size, - &readdir_response); - if (bytes_decoded < 0) { - ret = bytes_decoded; - gossip_err("Could not decode readdir from buffer %d\n", ret); - goto out_vfree; - } +static int orangefs_dir_iterate(struct file *file, + struct dir_context *ctx) +{ + struct orangefs_inode_s *oi; + struct orangefs_dir *od; + struct dentry *dentry; + int r; - if (bytes_decoded != new_op->downcall.trailer_size) { - gossip_err("orangefs_readdir: # bytes decoded (%ld) " - "!= trailer size (%ld)\n", - bytes_decoded, - (long)new_op->downcall.trailer_size); - ret = -EINVAL; - goto out_destroy_handle; - } + dentry = file->f_path.dentry; + oi = ORANGEFS_I(dentry->d_inode); + od = file->private_data; - /* - * orangefs doesn't actually store dot and dot-dot, but - * we need to have them represented. 
- */ - if (pos == 0) { - ino = get_ino_from_khandle(dentry->d_inode); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \".\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, ".", 1, ino, DT_DIR); - pos += 1; - } + if (od->error) + return od->error; - if (pos == 1) { - ino = get_parent_ino_from_dentry(dentry); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \"..\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, "..", 2, ino, DT_DIR); - pos += 1; + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos++; } - - /* - * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around - * to prevent "finding" dot and dot-dot on any iteration - * other than the first. - */ - if (ctx->pos == ORANGEFS_ITERATE_NEXT) - ctx->pos = 0; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: dirent_outcount:%d:\n", - __func__, - readdir_response.orangefs_dirent_outcount); - for (i = ctx->pos; - i < readdir_response.orangefs_dirent_outcount; - i++) { - len = readdir_response.dirent_array[i].d_length; - current_entry = readdir_response.dirent_array[i].d_name; - current_ino = orangefs_khandle_to_ino( - &readdir_response.dirent_array[i].khandle); - - gossip_debug(GOSSIP_DIR_DEBUG, - "calling dir_emit for %s with len %d" - ", ctx->pos %ld\n", - current_entry, - len, - (unsigned long)ctx->pos); - /* - * type is unknown. We don't return object type - * in the dirent_array. This leaves getdents - * clueless about type. - */ - ret = - dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN); - if (!ret) - break; + if (ctx->pos == 1) { + if (!dir_emit_dotdot(file, ctx)) + return 0; ctx->pos++; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld\n", - __func__, - lld(ctx->pos)); - } - /* - * we ran all the way through the last batch, set up for - * getting another batch... 
- */ - if (ret) { - *ptoken = readdir_response.token; - ctx->pos = ORANGEFS_ITERATE_NEXT; + r = 0; + + if (od->i < od->len) { + r = orangefs_dir_fill(oi, od, dentry, ctx); + if (r) + return r; } - /* - * Did we hit the end of the directory? - */ - if (readdir_response.token == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n"); - ctx->pos = ORANGEFS_READDIR_END; + if (od->token != ORANGEFS_READDIR_END) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + r = orangefs_dir_fill(oi, od, dentry, ctx); } -out_destroy_handle: - /* kfree(NULL) is safe */ - kfree(readdir_response.dirent_array); -out_vfree: - gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf); - vfree(dents_buf); -out_free_op: - op_release(new_op); - gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret); - return ret; + return r; } static int orangefs_dir_open(struct inode *inode, struct file *file) { - __u64 *ptoken; - - file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL); + struct orangefs_dir *od; + file->private_data = kmalloc(sizeof(struct orangefs_dir), + GFP_KERNEL); if (!file->private_data) return -ENOMEM; - - ptoken = file->private_data; - *ptoken = ORANGEFS_READDIR_START; + od = file->private_data; + od->token = ORANGEFS_READDIR_START; + /* + * XXX: It seems wasteful to allocate such a large buffer for + * each request. Most will be much smaller. 
+ */ + od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL); + if (!od->directory) { + kfree(file->private_data); + return -ENOMEM; + } + od->i = 0; + od->len = 0; + od->error = 0; return 0; } static int orangefs_dir_release(struct inode *inode, struct file *file) { + struct orangefs_dir *od = file->private_data; orangefs_flush_inode(inode); - kfree(file->private_data); + free_pages_exact(od->directory, MAX_DIRECTORY); + kfree(od); return 0; } -/** ORANGEFS implementation of VFS directory operations */ const struct file_operations orangefs_dir_operations = { .read = generic_read_dir, - .iterate = orangefs_readdir, + .iterate = orangefs_dir_iterate, .open = orangefs_dir_open, - .release = orangefs_dir_release, + .release = orangefs_dir_release }; diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 3b8923f8bf21..163001c95501 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -40,16 +40,6 @@ struct orangefs_mkdir_response { struct orangefs_object_kref refn; }; -/* - * duplication of some system interface structures so that I don't have - * to allocate extra memory - */ -struct orangefs_dirent { - char *d_name; - int d_length; - struct orangefs_khandle khandle; -}; - struct orangefs_statfs_response { __s64 block_size; __s64 blocks_total; @@ -131,12 +121,16 @@ struct orangefs_downcall_s { } resp; }; +/* + * The readdir response comes in the trailer. It is followed by the + * directory entries as described in dir.c. 
+ */ + struct orangefs_readdir_response_s { __u64 token; __u64 directory_version; __u32 __pad2; __u32 orangefs_dirent_outcount; - struct orangefs_dirent *dirent_array; }; #endif /* __DOWNCALL_H */ diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index f380f9ed1b28..efe08c763e56 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -52,12 +52,7 @@ */ #define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800 -/* - * The maximum number of directory entries in a single request is 96. - * XXX: Why can this not be higher. The client-side code can handle up to 512. - * XXX: What happens if we expect more than the client can return? - */ -#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96 +#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512 #include "upcall.h" #include "downcall.h" -- cgit v1.2.3 From 72f66b8329310b810dc9b70b08af728812d2e6c1 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:00 -0400 Subject: orangefs: support llseek on directories This and the previous commit fix xfstests generic/257. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index cf0ebb06b84e..9744fb3ad144 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -21,7 +21,7 @@ struct orangefs_dir { __u64 token; void *directory; - size_t i, len; + size_t len; int error; }; @@ -132,35 +132,40 @@ static int orangefs_dir_fill(struct orangefs_inode_s *oi, { struct orangefs_khandle *khandle; __u32 *len, padlen; + loff_t i; char *s; - while (od->i < od->len) { - if (od->len < od->i + sizeof *len) + i = ctx->pos - 2; + while (i < od->len) { + if (od->len < i + sizeof *len) goto eio; - len = od->directory + od->i; + len = od->directory + i; /* * len is the size of the string itself. padlen is the * total size of the encoded string. 
*/ padlen = (sizeof *len + *len + 1) + (4 - (sizeof *len + *len + 1)%8)%8; - if (od->len < od->i + padlen + sizeof *khandle) + if (od->len < i + padlen + sizeof *khandle) goto eio; - s = od->directory + od->i + sizeof *len; + s = od->directory + i + sizeof *len; if (s[*len] != 0) goto eio; - khandle = od->directory + od->i + padlen; + khandle = od->directory + i + padlen; if (!dir_emit(ctx, s, *len, orangefs_khandle_to_ino(khandle), DT_UNKNOWN)) return 0; - od->i += padlen + sizeof *khandle; - od->i = od->i + (8 - od->i%8)%8; - ctx->pos = 2 + od->i; + i += padlen + sizeof *khandle; + i = i + (8 - i%8)%8; + ctx->pos = i + 2; } - BUG_ON(od->i > od->len); + BUG_ON(i > od->len); return 0; eio: - gossip_err("orangefs_dir_fill: userspace returns corrupt data\n"); + /* + * Here either data from userspace is corrupt or the application + * has sought to an invalid location. + */ od->error = -EIO; return -EIO; } @@ -193,12 +198,29 @@ static int orangefs_dir_iterate(struct file *file, r = 0; - if (od->i < od->len) { + /* + * Must read more if the user has sought past what has been read + * so far. Stop a user who has sought past the end. + */ + while (od->token != ORANGEFS_READDIR_END && ctx->pos - 2 > + od->len) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + } + if (od->token == ORANGEFS_READDIR_END && ctx->pos - 2 > + od->len) { + return -EIO; + } + + /* Then try to fill if there's any left in the buffer. */ + if (ctx->pos - 2 < od->len) { r = orangefs_dir_fill(oi, od, dentry, ctx); if (r) return r; } + /* Finally get some more and try to fill. 
*/ if (od->token != ORANGEFS_READDIR_END) { r = orangefs_dir_more(oi, od, dentry); if (r) @@ -227,7 +249,6 @@ static int orangefs_dir_open(struct inode *inode, struct file *file) kfree(file->private_data); return -ENOMEM; } - od->i = 0; od->len = 0; od->error = 0; return 0; @@ -243,6 +264,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) } const struct file_operations orangefs_dir_operations = { + .llseek = default_llseek, .read = generic_read_dir, .iterate = orangefs_dir_iterate, .open = orangefs_dir_open, -- cgit v1.2.3 From 480e3e532e31666a18520a7964bb4095d7a16b9a Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:01 -0400 Subject: orangefs: support very large directories This works by maintaining a linked list of pages which the directory has been read into rather than one giant fixed-size buffer. This replaces code which limits the total directory size to the total amount that could be returned in one server request. Since filenames are usually considerably shorter than the maximum, the old code could usually handle several server requests before running out of space. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 273 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 185 insertions(+), 88 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 9744fb3ad144..7e9814fc6cc3 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -6,6 +6,22 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" +struct orangefs_dir_part { + struct orangefs_dir_part *next; + size_t len; +}; + +struct orangefs_dir { + __u64 token; + struct orangefs_dir_part *part; + loff_t end; + int error; +}; + +#define PART_SHIFT (24) +#define PART_SIZE (1<<24) +#define PART_MASK (~(PART_SIZE - 1)) + /* * There can be up to 512 directory entries. 
Each entry is encoded as * follows: @@ -15,42 +31,39 @@ * padding to 8 bytes * 16 bytes: khandle * padding to 8 bytes - */ -#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512) - -struct orangefs_dir { - __u64 token; - void *directory; - size_t len; - int error; -}; - -/* - * The userspace component sends several directory entries of the - * following format. The first four bytes are the string length not - * including a trailing zero byte. This is followed by the string and a - * trailing zero padded to the next four byte boundry. This is followed - * by the sixteen byte khandle padded to the next eight byte boundry. * * The trailer_buf starts with a struct orangefs_readdir_response_s * which must be skipped to get to the directory data. + * + * The data which is received from the userspace daemon is termed a + * part and is stored in a linked list in case more than one part is + * needed for a large directory. + * + * The position pointer (ctx->pos) encodes the part and offset at which + * to begin reading. Bits above PART_SHIFT encode the part and bits + * below PART_SHIFT encode the offset. Parts are stored in a linked + * list which grows as data is received from the server. The overhead + * associated with managing the list is presumed to be small compared to + * the overhead of communicating with the server. + * + * As data is received from the server, it is placed at the end of the + * part list. Data is parsed from the current position as it is needed. + * When data is determined to be corrupt, it is either because the + * userspace component has sent back corrupt data or because the file + * pointer has been moved to an invalid location. Since the two cannot + * be differentiated, return EIO. + * + * Part zero is synthesized to contain `.' and `..'. Part one is the + * first part of the part list. 
*/ -static int orangefs_dir_more(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry) +static int do_readdir(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct orangefs_kernel_op_s *op) { - const size_t offset = - sizeof(struct orangefs_readdir_response_s); struct orangefs_readdir_response_s *resp; - struct orangefs_kernel_op_s *op; int bufi, r; - op = op_alloc(ORANGEFS_VFS_OP_READDIR); - if (!op) { - od->error = -ENOMEM; - return -ENOMEM; - } - /* * Despite the badly named field, readdir does not use shared * memory. However, there are a limited number of readdir @@ -66,7 +79,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi, again: bufi = orangefs_readdir_index_get(); if (bufi < 0) { - op_release(op); od->error = bufi; return bufi; } @@ -84,7 +96,6 @@ again: goto again; } else if (r == -EIO) { vfree(op->downcall.trailer_buf); - op_release(op); od->error = r; return r; } @@ -92,82 +103,166 @@ again: if (r < 0) { vfree(op->downcall.trailer_buf); - op_release(op); od->error = r; return r; } else if (op->downcall.status) { vfree(op->downcall.trailer_buf); - op_release(op); od->error = op->downcall.status; return op->downcall.status; } + /* + * The maximum size is size per entry times the 512 entries plus + * the header. This is well under the limit. + */ + if (op->downcall.trailer_size > PART_SIZE) { + vfree(op->downcall.trailer_buf); + od->error = -EIO; + return -EIO; + } + resp = (struct orangefs_readdir_response_s *) op->downcall.trailer_buf; od->token = resp->token; + return 0; +} - if (od->len + op->downcall.trailer_size - offset <= - MAX_DIRECTORY) { - memcpy(od->directory + od->len, - op->downcall.trailer_buf + offset, - op->downcall.trailer_size - offset); - od->len += op->downcall.trailer_size - offset; - } else { - /* This limit was chosen based on protocol limits. 
*/ - gossip_err("orangefs_dir_more: userspace sent too much data\n"); - vfree(op->downcall.trailer_buf); - op_release(op); - od->error = -EIO; - return -EIO; +static int parse_readdir(struct orangefs_dir *od, + struct orangefs_kernel_op_s *op) +{ + struct orangefs_dir_part *part, *new; + size_t count; + + count = 1; + part = od->part; + while (part && part->next) { + part = part->next; + count++; } - vfree(op->downcall.trailer_buf); - op_release(op); + new = (void *)op->downcall.trailer_buf; + new->next = NULL; + new->len = op->downcall.trailer_size - + sizeof(struct orangefs_readdir_response_s); + if (!od->part) + od->part = new; + else + part->next = new; + count++; + od->end = count << PART_SHIFT; + return 0; } -static int orangefs_dir_fill(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry, +static int orangefs_dir_more(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry) +{ + struct orangefs_kernel_op_s *op; + int r; + + op = op_alloc(ORANGEFS_VFS_OP_READDIR); + if (!op) { + od->error = -ENOMEM; + return -ENOMEM; + } + r = do_readdir(oi, od, dentry, op); + if (r) { + od->error = r; + goto out; + } + r = parse_readdir(od, op); + if (r) { + od->error = r; + goto out; + } + + od->error = 0; +out: + op_release(op); + return od->error; +} + +static int fill_from_part(struct orangefs_dir_part *part, struct dir_context *ctx) { + const int offset = sizeof(struct orangefs_readdir_response_s); struct orangefs_khandle *khandle; __u32 *len, padlen; loff_t i; char *s; - i = ctx->pos - 2; - while (i < od->len) { - if (od->len < i + sizeof *len) - goto eio; - len = od->directory + i; + i = ctx->pos & ~PART_MASK; + + /* The file offset from userspace is too large. */ + if (i > part->len) + return -EIO; + + while (i < part->len) { + if (part->len < i + sizeof *len) + return -EIO; + len = (void *)part + offset + i; /* * len is the size of the string itself. padlen is the * total size of the encoded string. 
*/ padlen = (sizeof *len + *len + 1) + - (4 - (sizeof *len + *len + 1)%8)%8; - if (od->len < i + padlen + sizeof *khandle) - goto eio; - s = od->directory + i + sizeof *len; + (8 - (sizeof *len + *len + 1)%8)%8; + if (part->len < i + padlen + sizeof *khandle) + return -EIO; + s = (void *)part + offset + i + sizeof *len; if (s[*len] != 0) - goto eio; - khandle = od->directory + i + padlen; - + return -EIO; + khandle = (void *)part + offset + i + padlen; if (!dir_emit(ctx, s, *len, - orangefs_khandle_to_ino(khandle), DT_UNKNOWN)) + orangefs_khandle_to_ino(khandle), + DT_UNKNOWN)) return 0; i += padlen + sizeof *khandle; i = i + (8 - i%8)%8; - ctx->pos = i + 2; + BUG_ON(i > part->len); + ctx->pos = (ctx->pos & PART_MASK) | i; + } + return 1; +} + +static int orangefs_dir_fill(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct dir_context *ctx) +{ + struct orangefs_dir_part *part; + size_t count; + + count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; + + part = od->part; + while (part->next && count) { + count--; + part = part->next; + } + /* This means the userspace file offset is invalid. */ + if (count) { + od->error = -EIO; + return -EIO; + } + + while (part && part->len) { + int r; + r = fill_from_part(part, ctx); + if (r < 0) { + od->error = r; + return r; + } else if (r == 0) { + /* Userspace buffer is full. */ + break; + } else { + /* + * The part ran out of data. Move to the next + * part. */ + ctx->pos = (ctx->pos & PART_MASK) + + (1 << PART_SHIFT); + part = part->next; + } } - BUG_ON(i > od->len); return 0; -eio: - /* - * Here either data from userspace is corrupt or the application - * has sought to an invalid location. 
- */ - od->error = -EIO; - return -EIO; } static int orangefs_dir_iterate(struct file *file, @@ -193,28 +288,33 @@ static int orangefs_dir_iterate(struct file *file, if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return 0; - ctx->pos++; + ctx->pos = 1 << PART_SHIFT; } + /* + * The seek position is in the first synthesized part but is not + * valid. + */ + if ((ctx->pos & PART_MASK) == 0) + return -EIO; + r = 0; /* * Must read more if the user has sought past what has been read * so far. Stop a user who has sought past the end. */ - while (od->token != ORANGEFS_READDIR_END && ctx->pos - 2 > - od->len) { + while (od->token != ORANGEFS_READDIR_END && + ctx->pos > od->end) { r = orangefs_dir_more(oi, od, dentry); if (r) return r; } - if (od->token == ORANGEFS_READDIR_END && ctx->pos - 2 > - od->len) { + if (od->token == ORANGEFS_READDIR_END && ctx->pos > od->end) return -EIO; - } /* Then try to fill if there's any left in the buffer. */ - if (ctx->pos - 2 < od->len) { + if (ctx->pos < od->end) { r = orangefs_dir_fill(oi, od, dentry, ctx); if (r) return r; @@ -240,16 +340,8 @@ static int orangefs_dir_open(struct inode *inode, struct file *file) return -ENOMEM; od = file->private_data; od->token = ORANGEFS_READDIR_START; - /* - * XXX: It seems wasteful to allocate such a large buffer for - * each request. Most will be much smaller. 
- */ - od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL); - if (!od->directory) { - kfree(file->private_data); - return -ENOMEM; - } - od->len = 0; + od->part = NULL; + od->end = 1 << PART_SHIFT; od->error = 0; return 0; } @@ -257,8 +349,13 @@ static int orangefs_dir_open(struct inode *inode, struct file *file) static int orangefs_dir_release(struct inode *inode, struct file *file) { struct orangefs_dir *od = file->private_data; + struct orangefs_dir_part *part = od->part; orangefs_flush_inode(inode); - free_pages_exact(od->directory, MAX_DIRECTORY); + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } kfree(od); return 0; } -- cgit v1.2.3 From 7b796ae37092ef520641b3a96c211c1cc67a0346 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:02 -0400 Subject: orangefs: remove ORANGEFS_READDIR macros They are clones of the ORANGEFS_ITERATE macros in use elsewhere. Delete ORANGEFS_ITERATE_NEXT which is a hack previously used by readdir. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 8 ++++---- fs/orangefs/protocol.h | 9 ++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 7e9814fc6cc3..d5ec9ba82cef 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -304,13 +304,13 @@ static int orangefs_dir_iterate(struct file *file, * Must read more if the user has sought past what has been read * so far. Stop a user who has sought past the end. */ - while (od->token != ORANGEFS_READDIR_END && + while (od->token != ORANGEFS_ITERATE_END && ctx->pos > od->end) { r = orangefs_dir_more(oi, od, dentry); if (r) return r; } - if (od->token == ORANGEFS_READDIR_END && ctx->pos > od->end) + if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) return -EIO; /* Then try to fill if there's any left in the buffer. 
*/ @@ -321,7 +321,7 @@ static int orangefs_dir_iterate(struct file *file, } /* Finally get some more and try to fill. */ - if (od->token != ORANGEFS_READDIR_END) { + if (od->token != ORANGEFS_ITERATE_END) { r = orangefs_dir_more(oi, od, dentry); if (r) return r; @@ -339,7 +339,7 @@ static int orangefs_dir_open(struct inode *inode, struct file *file) if (!file->private_data) return -ENOMEM; od = file->private_data; - od->token = ORANGEFS_READDIR_START; + od->token = ORANGEFS_ITERATE_START; od->part = NULL; od->end = 1 << PART_SHIFT; od->error = 0; diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 971307ad69be..48bcc1bbe415 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset; #define ORANGEFS_G_SGID (1 << 10) #define ORANGEFS_U_SUID (1 << 11) -/* definition taken from stdint.h */ -#define INT32_MAX (2147483647) -#define ORANGEFS_ITERATE_START (INT32_MAX - 1) -#define ORANGEFS_ITERATE_END (INT32_MAX - 2) -#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3) -#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START -#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END +#define ORANGEFS_ITERATE_START 2147483646 +#define ORANGEFS_ITERATE_END 2147483645 #define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL #define ORANGEFS_APPEND_FL FS_APPEND_FL #define ORANGEFS_NOATIME_FL FS_NOATIME_FL -- cgit v1.2.3 From 68a24a6cc4a6025e111c282186a2506281d79b4b Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:03 -0400 Subject: orangefs: implement statx Fortunately OrangeFS has had a getattr request mask for a long time. The server basically has two difficulty levels for attributes. Fetching any attribute except size requires communicating with the metadata server for that handle. Since all the attributes are right there, it makes sense to return them all. Fetching the size requires communicating with every I/O server (that the file is distributed across). 
Therefore if asked for anything except size, get everything except size, and if asked for size, get everything. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 6 +++-- fs/orangefs/inode.c | 16 +++++++---- fs/orangefs/namei.c | 3 +++ fs/orangefs/orangefs-kernel.h | 4 ++- fs/orangefs/orangefs-utils.c | 63 +++++++++++++++++++++++++++++-------------- 5 files changed, 64 insertions(+), 28 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e6bbc8083d77..b421df11fe95 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -475,7 +475,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite /* Make sure generic_write_checks sees an up to date inode size. */ if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (rc == -ESTALE) rc = -EIO; if (rc) { @@ -693,7 +694,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a304bf34b212..8baf5458cecf 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) iattr->ia_size); /* Ensure that we have a up to date size, so we know if it changed. 
*/ - ret = orangefs_inode_getattr(inode, 0, 1); + ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -256,13 +256,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, "orangefs_getattr: called on %pd\n", path->dentry); - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, request_mask); if (ret == 0) { generic_fillattr(inode, stat); /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); stat->blksize = orangefs_inode->blksize; + + if (request_mask & STATX_SIZE) + stat->result_mask = STATX_BASIC_STATS; + else + stat->result_mask = STATX_BASIC_STATS & + ~STATX_SIZE; } return ret; } @@ -277,7 +283,7 @@ int orangefs_permission(struct inode *inode, int mask) gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); /* Make sure the permission (and other common attrs) are up to date. */ - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE); if (ret < 0) return ret; @@ -375,7 +381,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref if (!inode || !(inode->i_state & I_NEW)) return inode; - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) { iget_failed(inode); return ERR_PTR(error); @@ -420,7 +426,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) goto out_iput; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 7c315938e9c2..478e88bd7f9d 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + 
ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "%s: dentry instantiated for %pd\n", @@ -322,6 +323,7 @@ static int orangefs_symlink(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Symlink) %pU -> %pd\n", @@ -386,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Directory) %pU -> %pd\n", diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index d9b050bc8882..ea0ce507a6ab 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -215,6 +215,7 @@ struct orangefs_inode_s { unsigned long pinode_flags; unsigned long getattr_time; + u32 getattr_mask; }; #define P_ATIME_FLAG 0 @@ -495,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode, size_t size, int flags); -int orangefs_inode_getattr(struct inode *inode, int new, int bypass); +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 9b96b99539d6..fcbf4e56fd06 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new, return 0; } -int orangefs_inode_getattr(struct inode *inode, int new, int bypass) +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, 
int bypass) get_khandle_from_ino(inode)); if (!new && !bypass) { - if (time_before(jiffies, orangefs_inode->getattr_time)) + /* + * Must have all the attributes in the mask and be within cache + * time. + */ + if ((request_mask & orangefs_inode->getattr_mask) == + request_mask && + time_before(jiffies, orangefs_inode->getattr_time)) return 0; } @@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) if (!new_op) return -ENOMEM; new_op->upcall.req.getattr.refn = orangefs_inode->refn; - new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + /* + * Size is the hardest attribute to get. The incremental cost of any + * other attribute is essentially zero. + */ + if (request_mask & STATX_SIZE || new) + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + else + new_op->upcall.req.getattr.mask = + ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE; ret = service_operation(new_op, __func__, get_interruptible_flag(inode)); @@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) case S_IFREG: inode->i_flags = orangefs_inode_flags(&new_op-> downcall.resp.getattr.attributes); - inode_size = (loff_t)new_op-> - downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); - inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; - spin_lock(&inode->i_lock); - inode->i_bytes = inode_size; - inode->i_blocks = - (unsigned long)(rounded_up_size / 512); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode_size = (loff_t)new_op-> + downcall.resp.getattr.attributes.size; + rounded_up_size = + (inode_size + (4096 - (inode_size % 4096))); + inode->i_size = inode_size; + orangefs_inode->blksize = + new_op->downcall.resp.getattr.attributes.blksize; + spin_lock(&inode->i_lock); + inode->i_bytes = inode_size; + inode->i_blocks = + (unsigned long)(rounded_up_size / 512); + 
spin_unlock(&inode->i_lock); + } break; case S_IFDIR: - inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = i_blocksize(inode); - spin_lock(&inode->i_lock); - inode_set_bytes(inode, inode->i_size); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode->i_size = PAGE_SIZE; + orangefs_inode->blksize = i_blocksize(inode); + spin_lock(&inode->i_lock); + inode_set_bytes(inode, inode->i_size); + spin_unlock(&inode->i_lock); + } set_nlink(inode, 1); break; case S_IFLNK: @@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; + if (request_mask & STATX_SIZE || new) + orangefs_inode->getattr_mask = STATX_BASIC_STATS; + else + orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE; ret = 0; out: op_release(new_op); -- cgit v1.2.3 From 53950ef541675df48c219a8d665111a0e68dfc2f Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:04 -0400 Subject: orangefs: do not check possibly stale size on truncate Let the server figure this out because our size might be out of date or not present. The bug was that xfs_io -f -t -c "pread -v 0 100" /mnt/foo echo "Test" > /mnt/foo xfs_io -f -t -c "pread -v 0 100" /mnt/foo fails because the second truncate did not happen if nothing had requested the size after the write in echo. Thus i_size was zero (not present) and the orangefs_setattr thought i_size was zero and there was nothing to do. 
Signed-off-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 8baf5458cecf..9428ea0aac16 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) goto out; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + if (iattr->ia_valid & ATTR_SIZE) { ret = orangefs_setattr_size(inode, iattr); if (ret) goto out; -- cgit v1.2.3 From 9d286b0d8207a70d7e0ffbd5be864ff7a62de05a Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:05 -0400 Subject: orangefs: ensure the userspace component is unmounted if mount fails If the mount is aborted after userspace has been asked to mount, userspace must be told to unmount. Ordinarily orangefs_kill_sb does the unmount. However it cannot be called if the superblock has not been set up. This is a very narrow window. The NULL fs_id is not unmounted. 
Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-utils.c | 35 ----------------------------------- fs/orangefs/super.c | 28 +++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index fcbf4e56fd06..aab6f1842963 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -523,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode) return ret; } -int orangefs_unmount_sb(struct super_block *sb) -{ - int ret = -EINVAL; - struct orangefs_kernel_op_s *new_op = NULL; - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount_sb called on sb %p\n", - sb); - - new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); - if (!new_op) - return -ENOMEM; - new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id; - new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id; - strncpy(new_op->upcall.req.fs_umount.orangefs_config_server, - ORANGEFS_SB(sb)->devname, - ORANGEFS_MAX_SERVER_ADDR_LEN); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "Attempting ORANGEFS Unmount via host %s\n", - new_op->upcall.req.fs_umount.orangefs_config_server); - - ret = service_operation(new_op, "orangefs_fs_umount", 0); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount: got return value of %d\n", ret); - if (ret) - sb = ERR_PTR(ret); - else - ORANGEFS_SB(sb)->mount_pending = 1; - - op_release(new_op); - return ret; -} - void orangefs_make_bad_inode(struct inode *inode) { if (is_root_handle(inode)) { diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 629d8c917fa6..5c7c273e17ec 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = { .fh_to_dentry = orangefs_fh_to_dentry, }; +static int orangefs_unmount(int id, __s32 fs_id, const char *devname) +{ + struct orangefs_kernel_op_s *op; + int r; + op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); + if (!op) + return -ENOMEM; + 
op->upcall.req.fs_umount.id = id; + op->upcall.req.fs_umount.fs_id = fs_id; + strncpy(op->upcall.req.fs_umount.orangefs_config_server, + devname, ORANGEFS_MAX_SERVER_ADDR_LEN); + r = service_operation(op, "orangefs_fs_umount", 0); + /* Not much to do about an error here. */ + if (r) + gossip_err("orangefs_unmount: service_operation %d\n", r); + op_release(op); + return r; +} + static int orangefs_fill_sb(struct super_block *sb, struct orangefs_fs_mount_response *fs_mount, void *data, int silent) @@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (IS_ERR(sb)) { d = ERR_CAST(sb); + orangefs_unmount(new_op->downcall.resp.fs_mount.id, + new_op->downcall.resp.fs_mount.fs_id, devname); goto free_op; } @@ -539,6 +560,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ ORANGEFS_SB(sb)->no_list = 1; + /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); @@ -554,6 +576,7 @@ free_op: void orangefs_kill_sb(struct super_block *sb) { + int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ @@ -563,7 +586,10 @@ void orangefs_kill_sb(struct super_block *sb) * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ - orangefs_unmount_sb(sb); + r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, + ORANGEFS_SB(sb)->devname); + if (!r) + ORANGEFS_SB(sb)->mount_pending = 1; if (!ORANGEFS_SB(sb)->no_list) { /* remove the sb from our list of orangefs specific sb's */ -- cgit v1.2.3 From b7a57ccab891584d00ae03dce1176b2d4cbe08e7 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:06 -0400 Subject: orangefs: return from orangefs_devreq_read quickly if possible It is not necessary to take the lock and search through the request list if 
the list is empty. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/devorangefs-req.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index e1534c9bab16..c19f0787c9c6 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file, return -EINVAL; } + /* Check for an empty list before locking. */ + if (list_empty(&orangefs_request_list)) + return -EAGAIN; + restart: /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); -- cgit v1.2.3 From b5a9d61eebdd0016ccb383b25a5c3d04977a6549 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 25 Apr 2017 15:38:07 -0400 Subject: orangefs: do not wait for timeout if umounting When the computer is turned off, all the processes are killed and then all the filesystems are umounted. OrangeFS should not wait for the userspace daemon to come back in that case. This only works for plain umount(2). To actually take advantage of this interactively, `umount -f' is needed; otherwise umount will issue a statfs first, which will wait for the userspace daemon to come back. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/waitqueue.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index abcfa3fa9992..61e2ca7fec55 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -124,7 +124,14 @@ retry_servicing: gossip_debug(GOSSIP_WAIT_DEBUG, "%s:client core is NOT in service.\n", __func__); - timeout = op_timeout_secs * HZ; + /* + * Don't wait for the userspace component to return if + * the filesystem is being umounted anyway. 
+ */ + if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT) + timeout = 0; + else + timeout = op_timeout_secs * HZ; } spin_unlock(&orangefs_request_list_lock); -- cgit v1.2.3 From 907bfcd8d8a616ca794ba187f6bf1b0e12b3a8dd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 14 Apr 2017 22:11:53 +0300 Subject: orangefs: handle zero size write in debugfs If we write zero bytes to this debugfs file, then it will cause an underflow when we do copy_from_user(buf, ubuf, count - 1). Debugfs can normally only be written to by root so the impact of this is low. Signed-off-by: Dan Carpenter Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 791912da97d7..716ed337f166 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file, "orangefs_debug_write: %pD\n", file); + if (count == 0) + return 0; + /* * Thwart users who try to jamb a ridiculous number * of bytes into the debug file... -- cgit v1.2.3 From bf15ba7c1f9ad000d062968f931e80234db84a24 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 2 May 2017 12:15:10 -0400 Subject: orangefs: skip forward to the next directory entry if seek is short If userspace seeks to a position in the stream which is not correct, it would have returned EIO because the data in the buffer at that offset would be incorrect. This and the userspace daemon returning a corrupt directory are indistinguishable. Now if the data does not look right, skip forward to the next chunk and try again. The motivation is that if the directory changes, an application may seek to a position that was valid and no longer is valid. It is not yet possible for a directory to change. 
Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index d5ec9ba82cef..4cfb3e5b597e 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -194,11 +194,18 @@ static int fill_from_part(struct orangefs_dir_part *part, /* The file offset from userspace is too large. */ if (i > part->len) - return -EIO; + return 1; + + /* + * If the seek pointer is positioned just before an entry it + * should find the next entry. + */ + if (i % 8) + i = i + (8 - i%8)%8; while (i < part->len) { if (part->len < i + sizeof *len) - return -EIO; + break; len = (void *)part + offset + i; /* * len is the size of the string itself. padlen is the @@ -207,10 +214,10 @@ static int fill_from_part(struct orangefs_dir_part *part, padlen = (sizeof *len + *len + 1) + (8 - (sizeof *len + *len + 1)%8)%8; if (part->len < i + padlen + sizeof *khandle) - return -EIO; + goto next; s = (void *)part + offset + i + sizeof *len; if (s[*len] != 0) - return -EIO; + goto next; khandle = (void *)part + offset + i + padlen; if (!dir_emit(ctx, s, *len, orangefs_khandle_to_ino(khandle), @@ -220,6 +227,9 @@ static int fill_from_part(struct orangefs_dir_part *part, i = i + (8 - i%8)%8; BUG_ON(i > part->len); ctx->pos = (ctx->pos & PART_MASK) | i; + continue; +next: + i += 8; } return 1; } -- cgit v1.2.3 From 942835d68f6e16f2673c70791dc963c548681cb4 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 2 May 2017 12:15:11 -0400 Subject: orangefs: invalidate stored directory on seek If an application seeks to a position before the point which has been read, it must want updates which have been made to the directory. So delete the copy stored in the kernel so it will be fetched again. 
Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 4cfb3e5b597e..cac601498925 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -275,6 +275,28 @@ static int orangefs_dir_fill(struct orangefs_inode_s *oi, return 0; } +static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, + int whence) +{ + struct orangefs_dir *od = file->private_data; + /* + * Delete the stored data so userspace sees new directory + * entries. + */ + if (!whence && offset < od->end) { + struct orangefs_dir_part *part = od->part; + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; + } + return default_llseek(file, offset, whence); +} + static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx) { @@ -371,7 +393,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) } const struct file_operations orangefs_dir_operations = { - .llseek = default_llseek, + .llseek = orangefs_dir_llseek, .read = generic_read_dir, .iterate = orangefs_dir_iterate, .open = orangefs_dir_open, -- cgit v1.2.3 From 2f713b5c7d2a90baba6c88174c81fb9a96bfde21 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 4 May 2017 13:16:04 -0400 Subject: orangefs: count directory pieces correctly A large directory full of differently sized file names triggered this. Most directories, even very large directories with shorter names, would be lucky enough to fit in one server response. 
Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index cac601498925..d327cbd17756 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -135,9 +135,12 @@ static int parse_readdir(struct orangefs_dir *od, count = 1; part = od->part; - while (part && part->next) { - part = part->next; + while (part) { count++; + if (part->next) + part = part->next; + else + break; } new = (void *)op->downcall.trailer_buf; -- cgit v1.2.3