diff options
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/addr.c | 273 | ||||
-rw-r--r-- | fs/ceph/caps.c | 132 | ||||
-rw-r--r-- | fs/ceph/dir.c | 27 | ||||
-rw-r--r-- | fs/ceph/file.c | 97 | ||||
-rw-r--r-- | fs/ceph/inode.c | 59 | ||||
-rw-r--r-- | fs/ceph/locks.c | 64 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 41 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 10 | ||||
-rw-r--r-- | fs/ceph/snap.c | 37 | ||||
-rw-r--r-- | fs/ceph/super.c | 16 | ||||
-rw-r--r-- | fs/ceph/super.h | 55 | ||||
-rw-r--r-- | fs/ceph/super.h.rej | 10 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 7 |
13 files changed, 712 insertions, 116 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 18c06bbaf136..f5013d92a7e6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -192,17 +192,30 @@ static int readpage_nounlock(struct file *filp, struct page *page) struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; int err = 0; + u64 off = page_offset(page); u64 len = PAGE_CACHE_SIZE; - err = ceph_readpage_from_fscache(inode, page); + if (off >= i_size_read(inode)) { + zero_user_segment(page, err, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + /* + * Uptodate inline data should have been added into page cache + * while getting Fcr caps. + */ + if (ci->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + + err = ceph_readpage_from_fscache(inode, page); if (err == 0) goto out; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - (u64) page_offset(page), &len, + off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -319,7 +332,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) off, len); vino = ceph_vino(inode); req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 1, CEPH_OSD_OP_READ, + 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -384,6 +397,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, int rc = 0; int max = 0; + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, &nr_pages); @@ -673,7 +689,7 @@ static int ceph_writepages_start(struct address_space *mapping, int rc = 0; unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; - int do_sync; + int do_sync = 0; u64 truncate_size, snap_size; u32 truncate_seq; @@ -750,7 +766,6 @@ retry: last_snapc = snapc; while (!done && index <= end) { - int num_ops = do_sync ? 2 : 1; unsigned i; int first; pgoff_t next; @@ -850,7 +865,8 @@ get_more_pages: len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, - offset, &len, num_ops, + offset, &len, 0, + do_sync ? 2 : 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, @@ -862,6 +878,9 @@ get_more_pages: break; } + if (do_sync) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + req->r_callback = writepages_finish; req->r_inode = inode; @@ -1204,6 +1223,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; + struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; int want, got, ret; @@ -1215,7 +1235,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; while (1) { got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, + -1, &got, &pinned_page); if (ret == 0) break; if (ret != -ERESTARTSYS) { @@ -1226,12 +1247,54 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); - ret = filemap_fault(vma, vmf); + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || + ci->i_inline_version == CEPH_INLINE_NONE) + ret = filemap_fault(vma, vmf); + else + ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + if (pinned_page) + page_cache_release(pinned_page); ceph_put_cap_refs(ci, got); + if (ret != -EAGAIN) + return ret; + + /* read inline data */ + if (off >= PAGE_CACHE_SIZE) { + /* does not support inline data > PAGE_SIZE */ + ret = VM_FAULT_SIGBUS; + } else { + int ret1; + struct address_space *mapping = inode->i_mapping; + struct page *page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + ret1 = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret1 < 0 || off >= i_size_read(inode)) { + unlock_page(page); + page_cache_release(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + if (ret1 < PAGE_CACHE_SIZE) + zero_user_segment(page, ret1, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + SetPageUptodate(page); + vmf->page = page; + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; + } +out: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ret); return ret; } @@ -1250,6 +1313,19 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) size_t len; int want, got, ret; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + struct page *locked_page = NULL; + if (off == 0) { + lock_page(page); + locked_page = page; + } + ret = ceph_uninline_data(vma->vm_file, locked_page); + if (locked_page) + unlock_page(locked_page); + if (ret < 0) + return VM_FAULT_SIGBUS; + } + if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else @@ -1263,7 +1339,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; while (1) { got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); if (ret == 0) break; if (ret != -ERESTARTSYS) { @@ -1297,11 +1374,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) { + if (ret != VM_FAULT_LOCKED) unlock_page(page); - } else { + if (ret == VM_FAULT_LOCKED || + ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -1315,6 +1394,178 @@ out: return ret; } +void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (locked_page) { + page = locked_page; + } else { + if (i_size_read(inode) == 0) + return; + page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return; + if (PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return; + } + } + + dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", + inode, ceph_vinop(inode), len, locked_page); + + if (len > 0) { + void *kaddr = kmap_atomic(page); + memcpy(kaddr, data, len); + kunmap_atomic(kaddr); + } + + if (page != locked_page) { + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +} + +int ceph_uninline_data(struct file *filp, struct page *locked_page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page *page = NULL; + u64 len, inline_version; + int err = 0; + bool from_pagecache = false; + + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == 1 || /* initial version, no data */ + inline_version == CEPH_INLINE_NONE) + goto out; + + if (locked_page) { + page = locked_page; + WARN_ON(!PageUptodate(page)); + } else if (ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { + page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + from_pagecache = true; + lock_page(page); + } else { + page_cache_release(page); + page = NULL; + } + } + } + + if (page) { + len = i_size_read(inode); + if (len > PAGE_CACHE_SIZE) + len = PAGE_CACHE_SIZE; + } else { + page = __page_cache_alloc(GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto out; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0) { + /* no inline data */ + if (err == -ENODATA) + err = 0; + goto out; + } + len = err; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 0, 1, + CEPH_OSD_OP_CREATE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + 0, 0, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + if (err < 0) + goto out; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 1, 3, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &inline_version, + sizeof(inline_version), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", &inline_version, + sizeof(inline_version), 0, 0); + if (err) + goto out_put; + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); +out_put: + ceph_osdc_put_request(req); + if (err == -ECANCELED) + err = 0; +out: + if (page && page != locked_page) { + if (from_pagecache) { + unlock_page(page); + page_cache_release(page); + } else + __free_pages(page, 0); + } + + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", + inode, ceph_vinop(inode), inline_version, err); + return err; +} + static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cefca661464b..b93c631c6c87 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -975,10 +975,12 @@ static int send_cap_msg(struct ceph_mds_session *session, kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, - u64 follows) + u64 follows, bool inline_data) { struct ceph_mds_caps *fc; struct ceph_msg *msg; + void *p; + size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u mseq %u follows %lld size %llu/%llu" @@ -988,7 +990,10 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); + /* flock buffer size + inline version + inline data size */ + extra_len = 4 + 8 + 4; + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, + GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -1020,6 +1025,14 @@ static int send_cap_msg(struct ceph_mds_session *session, fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); + p = fc + 1; + /* flock buffer size */ + ceph_encode_32(&p, 0); + /* inline version */ + ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); + /* inline data size */ + ceph_encode_32(&p, 0); + fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { msg->middle = ceph_buffer_get(xattrs_buf); @@ -1126,6 +1139,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 flush_tid = 0; int i; int ret; + bool inline_data; held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; @@ -1209,13 +1223,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, xattr_version = ci->i_xattrs.version; } + inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, - follows); + follows, inline_data); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); delayed = 1; @@ -1336,7 +1352,7 @@ retry: capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows); + capsnap->follows, capsnap->inline_data); next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); @@ -2057,15 +2073,17 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff, int *check_max, int *err) + loff_t endoff, int *got, struct page **pinned_page, + int *check_max, int *err) { struct inode *inode = &ci->vfs_inode; int ret = 0; - int have, implemented; + int have, implemented, _got = 0; int file_wanted; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2075,7 +2093,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; ret = 1; - goto out; + goto out_unlock; } /* finish pending truncate */ @@ -2095,7 +2113,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, *check_max = 1; ret = 1; } - goto out; + goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we @@ -2103,7 +2121,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, */ if (__ceph_have_pending_cap_snap(ci)) { dout("get_cap_refs %p cap_snap_pending\n", inode); - goto out; + goto out_unlock; } } @@ -2120,18 +2138,50 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { - *got = need | (have & want); - __take_cap_refs(ci, *got); + _got = need | (have & want); + __take_cap_refs(ci, _got); ret = 1; } } else { dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } -out: +out_unlock: spin_unlock(&ci->i_ceph_lock); + + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(inode) > 0) { + int ret1; + struct page *page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + goto out; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while holding + * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* getattr request will bring inline data into page cache */ + ret1 = __ceph_do_getattr(inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret1 >= 0) { + ret = 0; + goto again; + } + *err = ret1; + ret = 1; + } +out: dout("get_cap_refs %p ret %d got %s\n", inode, - ret, ceph_cap_string(*got)); + ret, ceph_cap_string(_got)); + *got = _got; return ret; } @@ -2168,8 +2218,8 @@ static void check_max_size(struct inode *inode, loff_t endoff) * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, - loff_t endoff) +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + loff_t endoff, int *got, struct page **pinned_page) { int check_max, ret, err; @@ -2179,8 +2229,8 @@ retry: check_max = 0; err = 0; ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, - got, endoff, + try_get_cap_refs(ci, need, want, endoff, + got, pinned_page, &check_max, &err)); if (err) ret = err; @@ -2383,6 +2433,8 @@ static void invalidate_aliases(struct inode *inode) static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, void *snaptrace, int snaptrace_len, + u64 inline_version, + void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, struct ceph_cap *cap, int issued) @@ -2403,6 +2455,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool queue_invalidate = false; bool queue_revalidate = false; bool deleted_inode = false; + bool fill_inline = false; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2576,6 +2629,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } BUG_ON(cap->issued & ~cap->implemented); + if (inline_version > 0 && inline_version >= ci->i_inline_version) { + ci->i_inline_version = inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) + fill_inline = true; + } + spin_unlock(&ci->i_ceph_lock); if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { @@ -2589,6 +2649,9 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, wake = true; } + if (fill_inline) + ceph_fill_inline_data(inode, NULL, inline_data, inline_len); + if (queue_trunc) { ceph_queue_vmtruncate(inode); ceph_queue_revalidate(inode); @@ -2996,11 +3059,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 cap_id; u64 size, max_size; u64 tid; + u64 inline_version = 0; + void *inline_data = NULL; + u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; - void *flock; - void *end; - u32 flock_len; + void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3021,30 +3085,37 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); + p = snaptrace + snaptrace_len; if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p = snaptrace + snaptrace_len; + u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; - flock = p; - } else { - flock = NULL; - flock_len = 0; + p += flock_len; } if (le16_to_cpu(msg->hdr.version) >= 3) { if (op == CEPH_CAP_OP_IMPORT) { - void *p = flock + flock_len; if (p + sizeof(*peer) > end) goto bad; peer = p; + p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } + if (le16_to_cpu(msg->hdr.version) >= 4) { + ceph_decode_64_safe(&p, end, inline_version, bad); + ceph_decode_32_safe(&p, end, inline_len, bad); + if (p + inline_len > end) + goto bad; + inline_data = p; + p += inline_len; + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3085,6 +3156,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + inline_version, inline_data, inline_len, msg->middle, session, cap, issued); goto done_unlocked; } @@ -3105,8 +3177,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, - session, cap, issued); + handle_cap_grant(mdsc, inode, h, NULL, 0, + inline_version, inline_data, inline_len, + msg->middle, session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3137,8 +3210,7 @@ flush_cap_releases: done: mutex_unlock(&session->s_mutex); done_unlocked: - if (inode) - iput(inode); + iput(inode); return; bad: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 681a8537b64f..c241603764fd 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -183,7 +183,7 @@ more: spin_unlock(&parent->d_lock); /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { + if (!ceph_dir_is_complete_ordered(dir)) { dout(" lost dir complete on %p; falling back to mds\n", dir); dput(dentry); err = -EAGAIN; @@ -261,10 +261,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* always start with . and .. */ if (ctx->pos == 0) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - dout("readdir off 0 -> '.'\n"); if (!dir_emit(ctx, ".", 1, ceph_translate_ino(inode->i_sb, inode->i_ino), @@ -289,7 +285,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if ((ctx->pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - __ceph_dir_is_complete(ci) && + __ceph_dir_is_complete_ordered(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); @@ -312,6 +308,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ + if (ctx->pos == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = atomic_read(&ci->i_release_count); + fi->dir_ordered_count = ci->i_ordered_count; + } + more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -446,8 +449,12 @@ more: */ spin_lock(&ci->i_ceph_lock); if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - __ceph_dir_set_complete(ci, fi->dir_release_count); + if (ci->i_ordered_count == fi->dir_ordered_count) + dout(" marking %p complete and ordered\n", inode); + else + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count, + fi->dir_ordered_count); } spin_unlock(&ci->i_ceph_lock); @@ -805,7 +812,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) acls.pagelist = NULL; } err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) + if (!err && + !req->r_reply_info.head->is_target && + !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9f8e3572040e..ce74b394b49d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -333,6 +333,11 @@ int ceph_release(struct inode *inode, struct file *file) return 0; } +enum { + CHECK_EOF = 1, + READ_INLINE = 2, +}; + /* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) @@ -412,7 +417,7 @@ more: ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) - *checkeof = 1; + *checkeof = CHECK_EOF; } dout("striped_read returns %d\n", ret); @@ -598,7 +603,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, + vino, pos, &len, 0, 2,/*include a 'startsync' command*/ CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, @@ -609,6 +614,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { ret = n; @@ -713,7 +720,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 1, + vino, pos, &len, 0, 1, CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, @@ -803,9 +810,10 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); + struct page *pinned_page = NULL; ssize_t ret; int want, got = 0; - int checkeof = 0, read = 0; + int retry_op = 0, read = 0; again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", @@ -815,7 +823,7 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); if (ret < 0) return ret; @@ -827,8 +835,12 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, to, &checkeof); + if (ci->i_inline_version == CEPH_INLINE_NONE) { + /* hmm, this isn't really async... */ + ret = ceph_sync_read(iocb, to, &retry_op); + } else { + retry_op = READ_INLINE; + } } else { dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, @@ -838,13 +850,55 @@ again: } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + if (pinned_page) { + page_cache_release(pinned_page); + pinned_page = NULL; + } ceph_put_cap_refs(ci, got); + if (retry_op && ret >= 0) { + int statret; + struct page *page = NULL; + loff_t i_size; + if (retry_op == READ_INLINE) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + } - if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + statret = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, !!page); + if (statret < 0) { + __free_page(page); + if (statret == -ENODATA) { + BUG_ON(retry_op != READ_INLINE); + goto again; + } + return statret; + } + + i_size = i_size_read(inode); + if (retry_op == READ_INLINE) { + /* does not support inline data > PAGE_SIZE */ + if (i_size > PAGE_CACHE_SIZE) { + ret = -EIO; + } else if (iocb->ki_pos < i_size) { + loff_t end = min_t(loff_t, i_size, + iocb->ki_pos + len); + if (statret < end) + zero_user_segment(page, statret, end); + ret = copy_page_to_iter(page, + iocb->ki_pos & ~PAGE_MASK, + end - iocb->ki_pos, to); + iocb->ki_pos += ret; + } else { + ret = 0; + } + __free_pages(page, 0); + return ret; + } /* hit EOF or hole? */ - if (statret == 0 && iocb->ki_pos < inode->i_size && + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" ", reading more\n", iocb->ki_pos, @@ -852,7 +906,7 @@ again: read += ret; len -= ret; - checkeof = 0; + retry_op = 0; goto again; } } @@ -909,6 +963,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + err = ceph_uninline_data(file, NULL); + if (err < 0) + goto out; + } + retry_snap: if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { err = -ENOSPC; @@ -922,7 +982,8 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + &got, NULL); if (err < 0) goto out; @@ -969,6 +1030,7 @@ retry_snap: if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -1111,7 +1173,7 @@ static int ceph_zero_partial_object(struct inode *inode, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, length, - 1, op, + 0, 1, op, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, NULL, 0, 0, false); @@ -1214,6 +1276,12 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } + if (ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file, NULL); + if (ret < 0) + goto unlock; + } + size = i_size_read(inode); if (!(mode & FALLOC_FL_KEEP_SIZE)) endoff = offset + length; @@ -1223,7 +1291,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1240,6 +1308,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a5593d51d035..f61a74115beb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -387,8 +387,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) spin_lock_init(&ci->i_ceph_lock); ci->i_version = 0; + ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; + ci->i_ordered_count = 0; atomic_set(&ci->i_release_count, 1); atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; @@ -657,7 +659,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, * Populate an inode based on info from mds. May be called on new or * existing inodes. */ -static int fill_inode(struct inode *inode, +static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_info_in *iinfo, struct ceph_mds_reply_dirfrag *dirinfo, struct ceph_mds_session *session, @@ -675,6 +677,7 @@ static int fill_inode(struct inode *inode, bool wake = false; bool queue_trunc = false; bool new_version = false; + bool fill_inline = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), @@ -845,7 +848,8 @@ static int fill_inode(struct inode *inode, (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), + ci->i_ordered_count); } /* were we issued a capability? */ @@ -873,8 +877,23 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + + if (iinfo->inline_version > 0 && + iinfo->inline_version >= ci->i_inline_version) { + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + ci->i_inline_version = iinfo->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (locked_page || + (le32_to_cpu(info->cap.caps) & cache_caps))) + fill_inline = true; + } + spin_unlock(&ci->i_ceph_lock); + if (fill_inline) + ceph_fill_inline_data(inode, locked_page, + iinfo->inline_data, iinfo->inline_len); + if (wake) wake_up_all(&ci->i_cap_wq); @@ -1062,7 +1081,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *dir = req->r_locked_dir; if (dir) { - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + err = fill_inode(dir, NULL, + &rinfo->diri, rinfo->dirfrag, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) @@ -1132,7 +1152,7 @@ retry_lookup: } req->r_target_inode = in; - err = fill_inode(in, &rinfo->targeti, NULL, + err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, req->r_request_started, (!req->r_aborted && rinfo->head->result == 0) ? req->r_fmode : -1, @@ -1204,8 +1224,8 @@ retry_lookup: ceph_invalidate_dentry_lease(dn); /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(dir); - ceph_dir_clear_complete(olddir); + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1217,6 +1237,7 @@ retry_lookup: if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); if (dn->d_inode) { + ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); } else { @@ -1233,7 +1254,7 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { - ceph_dir_clear_complete(dir); + ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { @@ -1263,7 +1284,7 @@ retry_lookup: BUG_ON(!dir); BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); - ceph_dir_clear_complete(dir); + ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { @@ -1300,7 +1321,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { @@ -1416,7 +1437,7 @@ retry_lookup: } } - if (fill_inode(in, &rinfo->dir_in[i], NULL, session, + if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); @@ -1899,7 +1920,8 @@ out_put: * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ -int ceph_do_getattr(struct inode *inode, int mask, bool force) +int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force) { struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1911,7 +1933,8 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force) return 0; } - dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); + dout("do_getattr inode %p mask %s mode 0%o\n", + inode, ceph_cap_string(mask), inode->i_mode); if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; @@ -1922,7 +1945,19 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force) ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_page = locked_page; err = ceph_mdsc_do_request(mdsc, NULL, req); + if (locked_page && err == 0) { + u64 inline_version = req->r_reply_info.targeti.inline_version; + if (inline_version == 0) { + /* the reply is supposed to contain inline data */ + err = -EINVAL; + } else if (inline_version == CEPH_INLINE_NONE) { + err = -ENODATA; + } else { + err = req->r_reply_info.targeti.inline_len; + } + } ceph_mdsc_put_request(req); dout("do_getattr result=%d\n", err); return err; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index fbc39c47bacd..c35c5c614e38 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -9,6 +9,8 @@ #include <linux/ceph/pagelist.h> static u64 lock_secret; +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); static inline u64 secure_addr(void *addr) { @@ -40,6 +42,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, u64 length = 0; u64 owner; + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) + wait = 0; + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); @@ -68,6 +73,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; + if (wait) + req->r_wait_for_completion = ceph_lock_wait_for_completion; + err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { @@ -96,6 +104,52 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return err; } +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_request *intr_req; + struct inode *inode = req->r_inode; + int err, lock_type; + + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else + BUG_ON(1); + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); + + err = wait_for_completion_interruptible(&req->r_completion); + if (!err) + return 0; + + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", + req->r_tid); + + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, + USE_AUTH_MDS); + if (IS_ERR(intr_req)) + return PTR_ERR(intr_req); + + intr_req->r_inode = inode; + ihold(inode); + intr_req->r_num_caps = 1; + + intr_req->r_args.filelock_change = req->r_args.filelock_change; + intr_req->r_args.filelock_change.rule = lock_type; + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; + + err = ceph_mdsc_do_request(mdsc, inode, intr_req); + ceph_mdsc_put_request(intr_req); + + if (err && err != -ERESTARTSYS) + return err; + + wait_for_completion(&req->r_completion); + return 0; +} + /** * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. @@ -143,11 +197,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err); } } - - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -186,11 +235,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FLOCK, - CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a92d3f5c6c12..d2171f4a6980 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -89,6 +89,16 @@ static int parse_reply_info_in(void **p, void *end, ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; *p += info->xattr_len; + + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + return 0; bad: return err; @@ -524,8 +534,7 @@ void ceph_mdsc_release_request(struct kref *kref) } if (req->r_locked_dir) ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_target_inode) - iput(req->r_target_inode); + iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -861,8 +870,11 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> + * + * ClientSession messages with metadata are v2 */ - msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */ + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ p = msg->front.iov_base + sizeof(*h); @@ -1066,8 +1078,7 @@ out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); - if (last_inode) - iput(last_inode); + iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); @@ -1874,7 +1885,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = 2; + msg->hdr.version = cpu_to_le16(2); msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -2208,6 +2219,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, &req->r_completion, req->r_timeout); if (err == 0) err = -EIO; + } else if (req->r_wait_for_completion) { + err = req->r_wait_for_completion(mdsc, req); } else { err = wait_for_completion_killable(&req->r_completion); } @@ -3744,6 +3757,20 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, return msg; } +static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_sign_message(auth, msg); +} + +static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_check_message_signature(auth, msg); +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3753,6 +3780,8 @@ static const struct ceph_connection_operations mds_con_ops = { .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, .alloc_msg = mds_alloc_msg, + .sign_message = sign_message, + .check_message_signature = check_message_signature, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3288359353e9..e2817d00f7d9 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -41,6 +41,9 @@ struct ceph_mds_reply_info_in { char *symlink; u32 xattr_len; char *xattr_data; + u64 inline_version; + u32 inline_len; + char *inline_data; }; /* @@ -166,6 +169,11 @@ struct ceph_mds_client; */ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); +/* + * wait for request completion callback + */ +typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); /* * an in-flight mds request @@ -215,6 +223,7 @@ struct ceph_mds_request { int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; + struct page *r_locked_page; int r_err; bool r_aborted; @@ -239,6 +248,7 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; + ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ bool r_got_unsafe, r_got_safe, r_got_result; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f01645a27752..ce35fbd4ba5d 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -288,6 +288,9 @@ static int cmpu64_rev(const void *a, const void *b) return 0; } + +static struct ceph_snap_context *empty_snapc; + /* * build the snap context for a given realm. */ @@ -328,6 +331,12 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } + if (num == 0 && realm->seq == empty_snapc->seq) { + ceph_get_snap_context(empty_snapc); + snapc = empty_snapc; + goto done; + } + /* alloc new snap context */ err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) @@ -365,8 +374,8 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) - ceph_put_snap_context(realm->cached_context); +done: + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; return 0; @@ -466,6 +475,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); + } else if (ci->i_snap_realm->cached_context == empty_snapc) { + dout("queue_cap_snap %p empty snapc\n", inode); + kfree(capsnap); } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; @@ -504,6 +516,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this snapshot. */ @@ -590,15 +604,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); list_for_each_entry(child, &realm->children, child_item) { dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", @@ -928,5 +940,16 @@ out: return; } +int __init ceph_snap_init(void) +{ + empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!empty_snapc) + return -ENOMEM; + empty_snapc->seq = 1; + return 0; +} - +void ceph_snap_exit(void) +{ + ceph_put_snap_context(empty_snapc); +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f6e12377335c..50f06cddc94b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -515,7 +515,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_fs_client *fsc; const u64 supported_features = CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; + CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -1017,9 +1018,6 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) - static int __init init_ceph(void) { int ret = init_caches(); @@ -1028,15 +1026,20 @@ static int __init init_ceph(void) ceph_flock_init(); ceph_xattr_init(); + ret = ceph_snap_init(); + if (ret) + goto out_xattr; ret = register_filesystem(&ceph_fs_type); if (ret) - goto out_icache; + goto out_snap; pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); return 0; -out_icache: +out_snap: + ceph_snap_exit(); +out_xattr: ceph_xattr_exit(); destroy_caches(); out: @@ -1047,6 +1050,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_snap_exit(); ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b82f507979b8..e1aa32d0759d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -161,6 +161,7 @@ struct ceph_cap_snap { u64 time_warp_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ + bool inline_data; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -253,9 +254,11 @@ struct ceph_inode_info { spinlock_t i_ceph_lock; u64 i_version; + u64 i_inline_version; u32 i_time_warp_seq; unsigned i_ceph_flags; + int i_ordered_count; atomic_t i_release_count; atomic_t i_complete_count; @@ -434,14 +437,19 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count) + int release_count, int ordered_count) { atomic_set(&ci->i_complete_count, release_count); + if (ci->i_ordered_count == ordered_count) + ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; + else + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) @@ -455,16 +463,35 @@ static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) atomic_read(&ci->i_release_count); } +static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) +{ + return __ceph_dir_is_complete(ci) && + (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); +} + static inline void ceph_dir_clear_complete(struct inode *inode) { __ceph_dir_clear_complete(ceph_inode(inode)); } -static inline bool ceph_dir_is_complete(struct inode *inode) +static inline void ceph_dir_clear_ordered(struct inode *inode) { - return __ceph_dir_is_complete(ceph_inode(inode)); + struct ceph_inode_info *ci = ceph_inode(inode); + spin_lock(&ci->i_ceph_lock); + ci->i_ordered_count++; + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + spin_unlock(&ci->i_ceph_lock); } +static inline bool ceph_dir_is_complete_ordered(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool ret; + spin_lock(&ci->i_ceph_lock); + ret = __ceph_dir_is_complete_ordered(ci); + spin_unlock(&ci->i_ceph_lock); + return ret; +} /* find a specific frag @f */ extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, @@ -580,6 +607,7 @@ struct ceph_file_info { char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ int dir_release_count; + int dir_ordered_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -673,6 +701,8 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern int ceph_snap_init(void); +extern void ceph_snap_exit(void); /* * a cap_snap is "pending" if it is still awaiting an in-progress @@ -715,7 +745,12 @@ extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); -extern int ceph_do_getattr(struct inode *inode, int mask, bool force); +extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force); +static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) +{ + return __ceph_do_getattr(inode, NULL, mask, force); +} extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -830,7 +865,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, int mds, int drop, int unless); extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff); + loff_t endoff, int *got, struct page **pinned_page); /* for counting open files by mode */ static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) @@ -852,7 +887,9 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, int *opened); extern int ceph_release(struct inode *inode, struct file *filp); - +extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len); +int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct inode_operations ceph_dir_iops; diff --git a/fs/ceph/super.h.rej b/fs/ceph/super.h.rej new file mode 100644 index 000000000000..88fe3dfadb29 --- /dev/null +++ b/fs/ceph/super.h.rej @@ -0,0 +1,10 @@ +--- fs/ceph/super.h ++++ fs/ceph/super.h +@@ -254,6 +255,7 @@ + spinlock_t i_ceph_lock; + + u64 i_version; ++ u64 i_inline_version; + u32 i_time_warp_seq; + + unsigned i_ceph_flags; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 678b0d2bbbc4..5a492caf34cb 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -854,7 +854,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_pagelist *pagelist = NULL; int err; - if (value) { + if (size > 0) { /* copy value into pagelist */ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) @@ -864,7 +864,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = ceph_pagelist_append(pagelist, value, size); if (err) goto out; - } else { + } else if (!value) { flags |= CEPH_XATTR_REMOVE; } @@ -1001,6 +1001,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __ceph_setxattr(dentry, name, value, size, flags); } |