diff options
Diffstat (limited to 'fs/orangefs/file.c')
-rw-r--r-- | fs/orangefs/file.c | 389 |
1 files changed, 162 insertions, 227 deletions
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index b094d3d79354..a35c17017210 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. * * See COPYING in top-level directory. */ @@ -44,15 +45,16 @@ static int flush_racache(struct inode *inode) /* * Post and wait for the I/O upcall to finish */ -static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, - loff_t *offset, struct iov_iter *iter, - size_t total_size, loff_t readahead_size) +ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; struct orangefs_kernel_op_s *new_op = NULL; int buffer_index = -1; ssize_t ret; + size_t copy_amount; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) @@ -84,6 +86,10 @@ populate_shared_memory: new_op->upcall.req.io.buf_index = buffer_index; new_op->upcall.req.io.count = total_size; new_op->upcall.req.io.offset = *offset; + if (type == ORANGEFS_IO_WRITE && wr) { + new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); + new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); + } gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): offset: %llu total_size: %zd\n", @@ -168,7 +174,10 @@ populate_shared_memory: * trigger the write. */ case OP_VFS_STATE_INPROGR: - ret = total_size; + if (type == ORANGEFS_IO_READ) + ret = -EINTR; + else + ret = total_size; break; default: gossip_err("%s: unexpected op state :%d:.\n", @@ -204,8 +213,25 @@ populate_shared_memory: * can futher be kernel-space or user-space addresses. * or it can pointers to struct page's */ + + /* + * When reading, readahead_size will only be zero when + * we're doing O_DIRECT, otherwise we got here from + * orangefs_readpage. + * + * If we got here from orangefs_readpage we want to + * copy either a page or the whole file into the io + * vector, whichever is smaller. + */ + if (readahead_size) + copy_amount = + min(new_op->downcall.resp.io.amt_complete, + (__s64)PAGE_SIZE); + else + copy_amount = new_op->downcall.resp.io.amt_complete; + ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, - new_op->downcall.resp.io.amt_complete); + copy_amount); if (ret < 0) { gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", __func__, (long)ret); @@ -223,246 +249,112 @@ populate_shared_memory: out: if (buffer_index >= 0) { - orangefs_bufmap_put(buffer_index); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): PUT buffer_index %d\n", - __func__, handle, buffer_index); + if ((readahead_size) && (type == ORANGEFS_IO_READ)) { + /* readpage */ + *index_return = buffer_index; + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: hold on to buffer_index :%d:\n", + __func__, buffer_index); + } else { + /* O_DIRECT */ + orangefs_bufmap_put(buffer_index); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): PUT buffer_index %d\n", + __func__, handle, buffer_index); + } buffer_index = -1; } op_release(new_op); return ret; } -/* - * Common entry point for read/write/readv/writev - * This function will dispatch it to either the direct I/O - * or buffered I/O path depending on the mount options and/or - * augmented/extended metadata attached to the file. - * Note: File extended attributes override any mount options. - */ -static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, - loff_t *offset, struct iov_iter *iter) +int orangefs_revalidate_mapping(struct inode *inode) { - struct inode *inode = file->f_mapping->host; struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; - size_t count = iov_iter_count(iter); - ssize_t total_count = 0; - ssize_t ret = -EINVAL; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", - __func__, - handle, - (int)count); - - if (type == ORANGEFS_IO_WRITE) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): proceeding with offset : %llu, " - "size %d\n", - __func__, - handle, - llu(*offset), - (int)count); - } + struct address_space *mapping = inode->i_mapping; + unsigned long *bitlock = &orangefs_inode->bitlock; + int ret; - if (count == 0) { - ret = 0; - goto out; + while (1) { + ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); + if (ret) + return ret; + spin_lock(&inode->i_lock); + if (test_bit(1, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!time_before(jiffies, orangefs_inode->mapping_time)) + break; + spin_unlock(&inode->i_lock); + return 0; } - while (iov_iter_count(iter)) { - size_t each_count = iov_iter_count(iter); - size_t amt_complete; - - /* how much to transfer in this loop iteration */ - if (each_count > orangefs_bufmap_size_query()) - each_count = orangefs_bufmap_size_query(); + set_bit(1, bitlock); + smp_wmb(); + spin_unlock(&inode->i_lock); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): size of each_count(%d)\n", - __func__, - handle, - (int)each_count); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): BEFORE wait_for_io: offset is %d\n", - __func__, - handle, - (int)*offset); - - ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): return from wait_for_io:%d\n", - __func__, - handle, - (int)ret); + unmap_mapping_range(mapping, 0, 0, 0); + ret = filemap_write_and_wait(mapping); + if (!ret) + ret = invalidate_inode_pages2(mapping); - if (ret < 0) - goto out; - - *offset += ret; - total_count += ret; - amt_complete = ret; + orangefs_inode->mapping_time = jiffies + + orangefs_cache_timeout_msecs*HZ/1000; - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): AFTER wait_for_io: offset is %d\n", - __func__, - handle, - (int)*offset); - - /* - * if we got a short I/O operations, - * fall out and return what we got so far - */ - if (amt_complete < each_count) - break; - } /*end while */ - -out: - if (total_count > 0) - ret = total_count; - if (ret > 0) { - if (type == ORANGEFS_IO_READ) { - file_accessed(file); - } else { - file_update_time(file); - /* - * Must invalidate to ensure write loop doesn't - * prevent kernel from reading updated - * attribute. Size probably changed because of - * the write, and other clients could update - * any other attribute. - */ - orangefs_inode->getattr_time = jiffies - 1; - } - } - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): Value(%d) returned.\n", - __func__, - handle, - (int)ret); + clear_bit(1, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, 1); return ret; } -/* - * Read data from a specified offset in a file (referenced by inode). - * Data may be placed either in a user or kernel buffer. - */ -ssize_t orangefs_inode_read(struct inode *inode, - struct iov_iter *iter, - loff_t *offset, - loff_t readahead_size) +static ssize_t orangefs_file_read_iter(struct kiocb *iocb, + struct iov_iter *iter) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - size_t count = iov_iter_count(iter); - size_t bufmap_size; - ssize_t ret = -EINVAL; + int ret; + struct orangefs_read_options *ro; orangefs_stats.reads++; - bufmap_size = orangefs_bufmap_size_query(); - if (count > bufmap_size) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: count is too large (%zd/%zd)!\n", - __func__, count, bufmap_size); - return -EINVAL; + /* + * Remember how they set "count" in read(2) or pread(2) or whatever - + * users can use count as a knob to control orangefs io size and later + * we can try to help them fill as many pages as possible in readpage. + */ + if (!iocb->ki_filp->private_data) { + iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); + if (!iocb->ki_filp->private_data) + return(ENOMEM); + ro = iocb->ki_filp->private_data; + ro->blksiz = iter->count; } - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU) %zd@%llu\n", - __func__, - &orangefs_inode->refn.khandle, - count, - llu(*offset)); - - ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter, - count, readahead_size); - if (ret > 0) - *offset += ret; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): Value(%zd) returned.\n", - __func__, - &orangefs_inode->refn.khandle, - ret); + down_read(&file_inode(iocb->ki_filp)->i_rwsem); + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + goto out; + ret = generic_file_read_iter(iocb, iter); +out: + up_read(&file_inode(iocb->ki_filp)->i_rwsem); return ret; } -static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +static ssize_t orangefs_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t rc = 0; - - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); - - orangefs_stats.reads++; - - rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); - iocb->ki_pos = pos; - - return rc; -} - -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos; - ssize_t rc; - - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); - - inode_lock(file->f_mapping->host); - - /* Make sure generic_write_checks sees an up to date inode size. */ - if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); - if (rc == -ESTALE) - rc = -EIO; - if (rc) { - gossip_err("%s: orangefs_inode_getattr failed, " - "rc:%zd:.\n", __func__, rc); - goto out; - } - } - - rc = generic_write_checks(iocb, iter); - - if (rc <= 0) { - gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", - __func__, rc); - goto out; - } - - /* - * if we are appending, generic_write_checks would have updated - * pos to the end of the file, so we will wait till now to set - * pos... - */ - pos = iocb->ki_pos; - - rc = do_readv_writev(ORANGEFS_IO_WRITE, - file, - &pos, - iter); - if (rc < 0) { - gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", - __func__, rc); - goto out; - } - - iocb->ki_pos = pos; + int ret; orangefs_stats.writes++; -out: + if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + return ret; + } - inode_unlock(file->f_mapping->host); - return rc; + ret = generic_file_write_iter(iocb, iter); + return ret; } /* @@ -528,14 +420,13 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; int ret; - - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { - gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n", - __func__, ret); + gossip_err("%s: orangefs_inode_getattr failed, " + "ret:%d:.\n", __func__, ret); return VM_FAULT_SIGBUS; } return filemap_fault(vmf); @@ -544,7 +435,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf) static const struct vm_operations_struct orangefs_file_vm_ops = { .fault = orangefs_fault, .map_pages = filemap_map_pages, - .page_mkwrite = filemap_page_mkwrite, + .page_mkwrite = orangefs_page_mkwrite, }; /* @@ -552,15 +443,18 @@ static const struct vm_operations_struct orangefs_file_vm_ops = { */ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; + + ret = orangefs_revalidate_mapping(file_inode(file)); + if (ret) + return ret; + gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_mmap: called on %s\n", (file ? (char *)file->f_path.dentry->d_name.name : (char *)"Unknown")); - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - /* set the sequential readahead hint */ vma->vm_flags |= VM_SEQ_READ; vma->vm_flags &= ~VM_RAND_READ; @@ -600,8 +494,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); } - truncate_inode_pages(file_inode(file)->i_mapping, - 0); + } return 0; } @@ -619,6 +512,11 @@ static int orangefs_fsync(struct file *file, ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; + ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, + start, end); + if (ret < 0) + return ret; + new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); if (!new_op) return -ENOMEM; @@ -656,8 +554,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -700,6 +598,42 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) return rc; } +static int orangefs_file_open(struct inode * inode, struct file *file) +{ + file->private_data = NULL; + return generic_file_open(inode, file); +} + +static int orangefs_flush(struct file *file, fl_owner_t id) +{ + /* + * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the + * service_operation in orangefs_fsync. + * + * Do not send fsync to OrangeFS server on a close. Do send fsync + * on an explicit fsync call. This duplicates historical OrangeFS + * behavior. + */ + struct inode *inode = file->f_mapping->host; + int r; + + kfree(file->private_data); + file->private_data = NULL; + + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } + + r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); + if (r > 0) + return 0; + else + return r; +} + /** ORANGEFS implementation of VFS file operations */ const struct file_operations orangefs_file_operations = { .llseek = orangefs_file_llseek, @@ -708,7 +642,8 @@ const struct file_operations orangefs_file_operations = { .lock = orangefs_lock, .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, - .open = generic_file_open, + .open = orangefs_file_open, + .flush = orangefs_flush, .release = orangefs_file_release, .fsync = orangefs_fsync, }; |