From f3c60c5918f26ea16761ddc8b12d8401a3db626b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 11 Aug 2010 14:51:23 -0700 Subject: ceph: fix multiple mds session shutdown The use of a completion when waiting for session shutdown during umount is inappropriate, given the complexity of the condition. For multiple MDS's, this resulted in the umount thread spinning, often preventing the session close message from being processed in some cases. Switch to a waitqueue and defined a condition helper. This cleans things up nicely. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 68 +++++++++++++++++++++++++++------------------------- fs/ceph/mds_client.h | 3 ++- 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a75ddbf9fe37..397a47b696ce 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2208,7 +2208,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete_all(&mdsc->session_close_waiters); + wake_up_all(&mdsc->session_close_wq); kick_requests(mdsc, mds); break; @@ -2876,7 +2876,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) return -ENOMEM; init_completion(&mdsc->safe_umount_waiters); - init_completion(&mdsc->session_close_waiters); + init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; mdsc->max_sessions = 0; @@ -3021,6 +3021,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); } +/* + * true if all sessions are closed, or we force unmount + */ +bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + int i, n = 0; + + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) + if (mdsc->sessions[i]) + n++; + mutex_unlock(&mdsc->mutex); + return n == 0; +} /* * called after sb is ro. @@ -3029,45 +3046,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - int n; struct ceph_client *client = mdsc->client; - unsigned long started, timeout = client->mount_args->mount_timeout * HZ; + unsigned long timeout = client->mount_args->mount_timeout * HZ; dout("close_sessions\n"); - mutex_lock(&mdsc->mutex); - /* close sessions */ - started = jiffies; - while (time_before(jiffies, started + timeout)) { - dout("closing sessions\n"); - n = 0; - for (i = 0; i < mdsc->max_sessions; i++) { - session = __ceph_lookup_mds_session(mdsc, i); - if (!session) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&session->s_mutex); - __close_session(mdsc, session); - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - mutex_lock(&mdsc->mutex); - n++; - } - if (n == 0) - break; - - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) - break; - - dout("waiting for sessions to close\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; mutex_unlock(&mdsc->mutex); - wait_for_completion_timeout(&mdsc->session_close_waiters, - timeout); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { session = get_session(mdsc->sessions[i]); @@ -3080,9 +3084,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_lock(&mdsc->mutex); } } - WARN_ON(!list_empty(&mdsc->cap_delay_list)); - mutex_unlock(&mdsc->mutex); ceph_cleanup_empty_realms(mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ab7e89f5e344..c98267ce6d2a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -234,7 +234,8 @@ struct ceph_mds_client { struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; - struct completion safe_umount_waiters, session_close_waiters; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ -- cgit v1.2.3 From 082afec92d1052305af1195f591602f4d0f44277 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 22 Aug 2010 15:16:41 -0700 Subject: ceph: fix xattr cap writeback We should include the xattr metadata blob in the cap update message any time we are flushing dirty state, NOT just when we are also dropping the cap. This fixes async xattr writeback. Also, clean up the code slightly to avoid duplicating the bit test. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7bf182b03973..0ac2703f3bdf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, gid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; int delayed = 0; u64 flush_tid = 0; int i; @@ -1160,9 +1161,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, gid = inode->i_gid; mode = inode->i_mode; - if (dropping & CEPH_CAP_XATTR_EXCL) { + if (flushing & CEPH_CAP_XATTR_EXCL) { __ceph_build_xattrs_blob(ci); - xattr_version = ci->i_xattrs.version + 1; + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; } spin_unlock(&inode->i_lock); @@ -1170,9 +1172,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, - uid, gid, mode, - xattr_version, - (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL, + uid, gid, mode, xattr_version, xattr_blob, follows); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); -- cgit v1.2.3 From 4a625be47243e0e07dedd0a1a6b94c66c2ab93ba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 22 Aug 2010 15:03:56 -0700 Subject: ceph: include dirty xattrs state in snapped caps When we snapshot dirty metadata that needs to be written back to the MDS, include dirty xattr metadata. Make the capsnap reference the encoded xattr blob so that it will be written back in the FLUSHSNAP op. Also fix the capsnap creation guard to include dirty auth or file bits, not just tests specific to dirty file data or file writes in progress (this fixes auth metadata writeback). Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- fs/ceph/snap.c | 23 ++++++++++++++++------- fs/ceph/super.h | 8 +++++--- fs/ceph/xattr.c | 1 + 4 files changed, 23 insertions(+), 11 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0ac2703f3bdf..ba5bbf318fe1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1282,7 +1282,7 @@ retry: &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, - 0, NULL, + capsnap->xattr_version, capsnap->xattr_blob, capsnap->follows); next_follows = capsnap->follows + 1; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c0b26b6badba..2cb190c2bd99 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; - int used; + int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); if (!capsnap) { @@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -452,11 +453,13 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || + (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { struct ceph_snap_context *snapc = ci->i_head_snapc; igrab(inode); - + atomic_set(&capsnap->nref, 1); capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); @@ -464,15 +467,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->follows = snapc->seq - 1; capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = __ceph_caps_dirty(ci); + capsnap->dirty = dirty; capsnap->mode = inode->i_mode; capsnap->uid = inode->i_uid; capsnap->gid = inode->i_gid; - /* fixme? */ - capsnap->xattr_blob = NULL; - capsnap->xattr_len = 0; + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2482d696f0de..b33929d8f287 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -216,8 +216,7 @@ struct ceph_cap_snap { uid_t uid; gid_t gid; - void *xattr_blob; - int xattr_len; + struct ceph_buffer *xattr_blob; u64 xattr_version; u64 size; @@ -229,8 +228,11 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); + } } /* diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 097a2654c00f..9578af610b73 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; } } -- cgit v1.2.3 From ed326044489ed89c740c50a3df5dffc9c3b20b96 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 16 Aug 2010 13:37:31 -0700 Subject: ceph: queue cap snap writeback for realm children on snap update When a realm is updated, we need to queue writeback on inodes in that realm _and_ its children. Otherwise, if the inode gets cowed on the server, we can get a hang later due to out-of-sync cap/snap state. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 60 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 2cb190c2bd99..6bdbf3ae7082 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -548,6 +548,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, return 1; /* caller may want to ceph_flush_snaps */ } +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + + dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); + list_for_each_entry(child, &realm->children, child_item) + queue_realm_cap_snaps(child); + + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} /* * Parse and apply a snapblob "snap trace" from the MDS. This specifies @@ -598,29 +633,8 @@ more: * * ...unless it's a snap deletion! */ - if (!deletion) { - struct ceph_inode_info *ci; - struct inode *lastinode = NULL; - - spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); - if (!inode) - continue; - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - lastinode = inode; - ceph_queue_cap_snap(ci); - spin_lock(&realm->inodes_with_caps_lock); - } - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - dout("update_snap_trace cap_snaps queued\n"); - } - + if (!deletion) + queue_realm_cap_snaps(realm); } else { dout("update_snap_trace %llx %p seq %lld unchanged\n", realm->ino, realm, realm->seq); -- cgit v1.2.3 From eb6bb1c5bdc6e455a9d16cb845cc65afc9b0a617 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 16 Aug 2010 09:21:27 -0700 Subject: ceph: direct requests in snapped namespace based on nonsnap parent When making a request in the virtual snapdir or a snapped portion of the namespace, we should choose the MDS based on the first nonsnap parent (and its caps). If that is not the best place, we will get forward hints to find the right MDS in the cluster. This fixes ESTALE errors when using the .snap directory and namespace with multiple MDSs. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 397a47b696ce..8d1f11c7a5a2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ +struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - if (req->r_dentry->d_inode) { + struct inode *dir = req->r_dentry->d_parent->d_inode; + + if (dir->i_sb != mdsc->client->sb) { + /* not this fs! */ + inode = req->r_dentry->d_inode; + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = + get_nonsnap_parent(req->r_dentry->d_parent); + inode = dn->d_inode; + dout("__choose_mds using nonsnap parent %p\n", inode); + } else if (req->r_dentry->d_inode) { + /* dentry target */ inode = req->r_dentry->d_inode; } else { - inode = req->r_dentry->d_parent->d_inode; + /* dir + name */ + inode = dir; hash = req->r_dentry->d_name.hash; is_hash = true; } } + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, (int)hash, mode); if (!inode) -- cgit v1.2.3 From 679ceace848e9fd570678396ffe1ef034e00e82d Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Fri, 20 Aug 2010 02:31:26 -0700 Subject: mm: exporting account_page_dirty This allows code outside of the mm core to safely manipulate page state and not worry about the other accounting. Not using these routines means that some code will lose track of the accounting and we get bugs. This has happened once already. Signed-off-by: Michael Rubin Signed-off-by: Sage Weil --- fs/ceph/addr.c | 8 +------- mm/page-writeback.c | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5598a0d02295..420d46974ec8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page) spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, page->mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef61548..849d0ccbe914 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1096,6 +1096,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) task_io_account_write(PAGE_CACHE_SIZE); } } +EXPORT_SYMBOL(account_page_dirtied); /* * For address_spaces which do not use buffers. Just tag the page as dirty in -- cgit v1.2.3 From 124514918b030d74f1f3e15483b7bf3b85268082 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 22 Aug 2010 21:33:32 -0700 Subject: ceph: don't improperly set dir complete when holding EXCL cap If we hold the EXCL cap, we cannot trust the dir stats from the MDS (num files, subdirs) and must not incorrectly conclude that the directory is empty. If we do, we get can bad results from lookup (bad ENOENT) and bad readdir results. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ceph') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5d893d31e399..3e6b52cb5ee8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode, if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; -- cgit v1.2.3 From 07a27e226d1ed210d2d4218bd0642b40f5405c6a Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Sun, 22 Aug 2010 21:34:27 -0700 Subject: ceph: fix osd request lru adjustment when sending request Fix argument order. We want to move the item to the end of the list, not change the position of the head. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index bed6391e52c7..dfced1dacbcd 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc, reqhead->reassert_version = req->r_reassert_version; req->r_stamp = jiffies; - list_move_tail(&osdc->req_lru, &req->r_req_lru_item); + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ ceph_con_send(&req->r_osd->o_con, req->r_request); -- cgit v1.2.3 From 7d8cb26d7dcb911f110b7762bd5941e8f009d6c3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 Aug 2010 08:44:16 -0700 Subject: ceph: maintain i_head_snapc when any caps are dirty, not just for data We used to use i_head_snapc to keep track of which snapc the current epoch of dirty data was dirtied under. It is used by queue_cap_snap to set up the cap_snap. However, since we queue cap snaps for any dirty caps, not just for dirty file data, we need to keep a valid i_head_snapc anytime we have dirty|flushing caps. This fixes a NULL pointer deref in queue_cap_snap when writing back dirty caps without data (e.g., snaptest-authwb.sh). Signed-off-by: Sage Weil --- fs/ceph/addr.c | 4 ++-- fs/ceph/caps.c | 20 +++++++++++++++++--- fs/ceph/snap.c | 6 +++++- fs/ceph/super.h | 3 ++- 4 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 420d46974ec8..4cfce1ee31fa 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) /* dirty the head */ spin_lock(&inode->i_lock); - if (ci->i_wrbuffer_ref_head == 0) + if (ci->i_head_snapc == NULL) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; if (ci->i_wrbuffer_ref == 0) @@ -346,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_head_snapc) { + if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ba5bbf318fe1..a2069b6680ae 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1143,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, for (i = 0; i < CEPH_CAP_BITS; i++) if (flushing & (1 << i)) ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; } keep = cap->implemented; @@ -1156,7 +1160,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, mtime = inode->i_mtime; atime = inode->i_atime; time_warp_seq = ci->i_time_warp_seq; - follows = ci->i_snap_realm->cached_context->seq; uid = inode->i_uid; gid = inode->i_gid; mode = inode->i_mode; @@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - dout(" inode %p now dirty\n", &ci->vfs_inode); + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, + ci->i_head_snapc); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; - if (!ci->i_wrbuffer_ref_head) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } @@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 6bdbf3ae7082..4868b9dcac5a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -458,6 +458,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { struct ceph_snap_context *snapc = ci->i_head_snapc; + dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, + capsnap, snapc); igrab(inode); atomic_set(&capsnap->nref, 1); @@ -489,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; capsnap->context = snapc; - ci->i_head_snapc = NULL; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b33929d8f287..c33897ae5725 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -344,7 +344,8 @@ struct ceph_inode_info { unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ - struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ -- cgit v1.2.3 From 36e21687e6e51c4225c42e6291938363f7bbfa7c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 Aug 2010 16:23:48 -0700 Subject: ceph: initialize fields on new dentry_infos Signed-off-by: Sage Weil --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 67bbb41d5526..6e4f43ff23ec 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry) else dentry->d_op = &ceph_snap_dentry_ops; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ -- cgit v1.2.3 From ac1f12ef569d49b013c3db86e11be7e15d66b1c3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:11:35 +0200 Subject: ceph: ceph_get_inode() returns an ERR_PTR ceph_get_inode() returns an ERR_PTR and it doesn't return a NULL. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3e6b52cb5ee8..e7cca414da03 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1230,11 +1230,11 @@ retry_lookup: in = dn->d_inode; } else { in = ceph_get_inode(parent->d_sb, vino); - if (in == NULL) { + if (IS_ERR(in)) { dout("new_inode badness\n"); d_delete(dn); dput(dn); - err = -ENOMEM; + err = PTR_ERR(in); goto out; } dn = splice_dentry(dn, in, NULL); -- cgit v1.2.3 From ad8453ab0a5b98884074302ba3cc37664791e261 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Aug 2010 13:26:32 +0100 Subject: ceph: Fix warnings Just scrubbing some warnings so I can see real problem ones in the build noise. For 32bit we need to coax gcc politely into believing we really honestly intend to the casts. Using (u64)(unsigned long) means we cast from a pointer to a type of the right size and then extend it. This stops the warning spew. Signed-off-by: Alan Cox Signed-off-by: Sage Weil --- fs/ceph/locks.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae85af06454f..ff4e753aae92 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, (u64)fl->fl_nspid, + (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, lock_cmd, fl->fl_start, length, wait); if (!err) { @@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) /* undo! This should only happen if the kernel detects * local deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, (u64)fl->fl_nspid, + (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, CEPH_LOCK_UNLOCK, fl->fl_start, length, 0); dout("got %d on posix_lock_file, undid lock", err); @@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, (u64)fl->fl_nspid, + file, (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, lock_cmd, fl->fl_start, length, wait); if (!err) { @@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, (u64)fl->fl_pid, - (u64)fl->fl_nspid, + (u64)(unsigned long)fl->fl_nspid, CEPH_LOCK_UNLOCK, fl->fl_start, length, 0); dout("got %d on flock_lock_file_wait, undid lock", err); @@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock, cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); + cephlock->pid_namespace = + cpu_to_le64((u64)(unsigned long)lock->fl_nspid); switch (lock->fl_type) { case F_RDLCK: -- cgit v1.2.3 From f44c3890d9fd6e4284518ff3bb16879fee194a3a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 Aug 2010 11:07:24 +0200 Subject: ceph: ceph_mdsc_build_path() returns an ERR_PTR ceph_mdsc_build_path() returns an ERR_PTR but this code is set up to handle NULL returns. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 360c4f22718d..6fd8b20a8611 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_dentry) { path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_dentry->d_parent->d_inode), @@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_old_dentry) { path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_old_dentry->d_parent->d_inode), -- cgit v1.2.3 From e072f8aa3587710cd35cce0f6b6efd7b4276c327 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 26 Aug 2010 09:26:37 -0700 Subject: ceph: don't BUG on ENOMEM during mds reconnect We are in a position to return an error; do that instead. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8d1f11c7a5a2..f091b1351786 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2324,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); if (IS_ERR(path)) { err = PTR_ERR(path); - BUG_ON(err); + goto out_dput; } } else { path = NULL; @@ -2332,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } err = ceph_pagelist_encode_string(pagelist, path, pathlen); if (err) - goto out; + goto out_free; spin_lock(&inode->i_lock); cap->seq = 0; /* reset cap seq */ @@ -2376,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, unlock_kernel(); } -out: +out_free: kfree(path); +out_dput: dput(dentry); return err; } -- cgit v1.2.3 From b545787dbb00a041c541a4759d938ddb0108295a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 Aug 2010 11:12:38 +0200 Subject: ceph: fix get_ticket_handler() error handling get_ticket_handler() returns a valid pointer or it returns ERR_PTR(-ENOMEM) if kzalloc() fails. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 582e0b2caf8a..a2d002cbdec2 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) th = get_ticket_handler(ac, service); - if (!th) { + if (IS_ERR(th)) { *pneed |= service; continue; } @@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + ceph_x_validate_tickets(ac, &need); dout("build_request want %x have %x need %x\n", @@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return -ERANGE; head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - BUG_ON(!th); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; @@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); - BUG_ON(!th); + if (IS_ERR(th)) + return PTR_ERR(th); ret = ceph_x_proc_ticket_reply(ac, &th->session_key, buf + sizeof(*head), end); break; @@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, void *end = p + sizeof(au->reply_buf); th = get_ticket_handler(ac, au->service); - if (!th) - return -EIO; /* hrm! */ + if (IS_ERR(th)) + return PTR_ERR(th); ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); if (ret < 0) return ret; @@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (th && !IS_ERR(th)) + if (!IS_ERR(th)) remove_ticket_handler(ac, th); } -- cgit v1.2.3 From ca04d9c3ec721e474f00992efc1b1afb625507f5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 26 Aug 2010 16:12:01 -0700 Subject: ceph: fix null pointer deref on anon root dentry release When we release a root dentry, particularly after a splice, the parent (actually our) inode was evaluating to NULL and was getting dereferenced by ceph_snap(). This is reproduced by something as simple as mount -t ceph monhost:/a/b mnt mount -t ceph monhost:/a mnt2 ls mnt2 A splice_dentry() would kill the old 'b' inode's root dentry, and we'd crash while releasing it. Fix by checking for both the ROOT and NULL cases explicitly. We only need to invalidate the parent dir when we have a correct parent to invalidate. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6e4f43ff23ec..a1986eb52045 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1021,11 +1021,15 @@ out_touch: static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *parent_inode = dentry->d_parent->d_inode; - u64 snapid = ceph_snap(parent_inode); + struct inode *parent_inode = NULL; + u64 snapid = CEPH_NOSNAP; + if (!IS_ROOT(dentry)) { + parent_inode = dentry->d_parent->d_inode; + if (parent_inode) + snapid = ceph_snap(parent_inode); + } dout("dentry_release %p parent %p\n", dentry, parent_inode); - if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); -- cgit v1.2.3 From 3d4401d9d0aef5c40706350685ddea3df6708496 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 3 Sep 2010 12:57:11 -0700 Subject: ceph: fix pagelist kunmap tail A wrong parameter was passed to the kunmap. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/pagelist.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c index b6859f47d364..46a368b6dce5 100644 --- a/fs/ceph/pagelist.c +++ b/fs/ceph/pagelist.c @@ -5,10 +5,18 @@ #include "pagelist.h" +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + struct page *page = list_entry(pl->head.prev, struct page, + lru); + kunmap(page); +} + int ceph_pagelist_release(struct ceph_pagelist *pl) { if (pl->mapped_tail) - kunmap(pl->mapped_tail); + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { struct page *page = list_first_entry(&pl->head, struct page, lru); @@ -26,7 +34,7 @@ static int ceph_pagelist_addpage(struct ceph_pagelist *pl) pl->room += PAGE_SIZE; list_add_tail(&page->lru, &pl->head); if (pl->mapped_tail) - kunmap(pl->mapped_tail); + ceph_pagelist_unmap_tail(pl); pl->mapped_tail = kmap(page); return 0; } -- cgit v1.2.3 From 3612abbd5df6baa9ca3e0777c6c8646e202d3f66 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Sep 2010 15:59:27 -0700 Subject: ceph: fix reconnect encoding for old servers Fix the reconnect encoding to encode the cap record when the MDS does not have the FLOCK capability (i.e., pre v0.22). Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f091b1351786..fad95f8f2608 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2374,6 +2374,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, num_fcntl_locks, num_flock_locks); unlock_kernel(); + } else { + err = ceph_pagelist_append(pagelist, &rec, reclen); } out_free: -- cgit v1.2.3 From a77d9f7dce7600058d56f0670ed29d77abffcde2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 11 Sep 2010 10:55:25 -0700 Subject: ceph: fix file offset wrapping at 4GB on 32-bit archs Cast the value before shifting so that we don't run out of bits with a 32-bit unsigned long. This fixes wrapping of high file offsets into the low 4GB of a file on disk, and the subsequent data corruption for large files. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4cfce1ee31fa..50461b8c23a4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -766,7 +766,8 @@ get_more_pages: /* ok */ if (locked_pages == 0) { /* prepare async write request */ - offset = page->index << PAGE_CACHE_SHIFT; + offset = (unsigned long long)page->index + << PAGE_CACHE_SHIFT; len = wsize; req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, -- cgit v1.2.3 From 467c525109d5d542d7d416b0c11bdd54610fe2f4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 13 Sep 2010 11:39:20 -0700 Subject: ceph: fix dn offset during readdir_prepopulate When adding the readdir results to the cache, ceph_set_dentry_offset was clobbered our just-set offset. This can cause the readdir result offsets to get out of sync with the server. Add an argument to the helper so that it does not. This bug was introduced by 1cd3935bedccf592d44343890251452a6dd74fc4. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e7cca414da03..62377ec37edf 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -845,7 +845,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash) + bool *prehash, bool set_offset) { struct dentry *realdn; @@ -877,7 +877,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - ceph_set_dentry_offset(dn); + if (set_offset) + ceph_set_dentry_offset(dn); out: return dn; } @@ -1062,7 +1063,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, d_delete(dn); goto done; } - dn = splice_dentry(dn, in, &have_lease); + dn = splice_dentry(dn, in, &have_lease, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1105,7 +1106,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } dout(" linking snapped dir %p to dn %p\n", in, dn); - dn = splice_dentry(dn, in, NULL); + dn = splice_dentry(dn, in, NULL, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1237,7 +1238,7 @@ retry_lookup: err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL); + dn = splice_dentry(dn, in, NULL, false); if (IS_ERR(dn)) dn = NULL; } -- cgit v1.2.3 From 8bef9239ee1a42eb37d3f83bacf6a75f019c028d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Sep 2010 15:45:44 -0700 Subject: ceph: correctly set 'follows' in flushsnap messages The 'follows' should match the seq for the snap context for the given snap cap, which is the context under which we have been dirtying and writing data and metadata. The snapshot that _contains_ those updates thus _follows_ that context's seq #. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4868b9dcac5a..9e836afba341 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -467,7 +467,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) INIT_LIST_HEAD(&capsnap->ci_item); INIT_LIST_HEAD(&capsnap->flushing_item); - capsnap->follows = snapc->seq - 1; + capsnap->follows = snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; -- cgit v1.2.3 From cfc0bf6640dfd0f43bf8bfec5a475284809baa4d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Sep 2010 15:50:59 -0700 Subject: ceph: stop sending FLUSHSNAPs when we hit a dirty capsnap Stop sending FLUSHSNAP messages when we hit a capsnap that has dirty_pages or is still writing. We'll send the newer capsnaps only after the older ones complete. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a2069b6680ae..9fbe9019155c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1227,7 +1227,7 @@ retry: * pages to be written out. */ if (capsnap->dirty_pages || capsnap->writing) - continue; + break; /* * if cap writeback already occurred, we should have dropped @@ -1276,8 +1276,8 @@ retry: &session->s_cap_snaps_flushing); spin_unlock(&inode->i_lock); - dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", - inode, capsnap, next_follows, capsnap->size); + dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", + inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, -- cgit v1.2.3 From ae00d4f37f4df56821331deb1028748110dd6dc9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Sep 2010 16:26:51 -0700 Subject: ceph: fix cap_snap and realm split The cap_snap creation/queueing relies on both the current i_head_snapc _and_ the i_snap_realm pointers being correct, so that the new cap_snap can properly reference the old context and the new i_head_snapc can be updated to reference the new snaprealm's context. To fix this, we: - move inodes completely to the new (split) realm so that i_snap_realm is correct, and - generate the new snapc's _before_ queueing the cap_snaps in ceph_update_snap_trace(). Signed-off-by: Sage Weil --- fs/ceph/addr.c | 4 +-- fs/ceph/snap.c | 88 +++++++++++++++++++-------------------------------------- fs/ceph/super.h | 2 ++ 3 files changed, 33 insertions(+), 61 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 50461b8c23a4..efbc604001c8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -411,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (i_size < page_off + len) len = i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u\n", - inode, page, page->index, page_off, len); + dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", + inode, page, page->index, page_off, len, snapc); writeback_stat = atomic_long_inc_return(&client->writeback_count); if (writeback_stat > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9e836afba341..9e6eef14b7df 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); @@ -604,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; int invalidate = 0; int err = -ENOMEM; + LIST_HEAD(dirty_realms); dout("update_snap_trace deletion=%d\n", deletion); more: @@ -626,24 +628,6 @@ more: } } - if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", - realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); - /* - * if the realm seq has changed, queue a cap_snap for every - * inode with open caps. we do this _before_ we update - * the realm info so that we prepare for writeback under the - * _previous_ snap context. - * - * ...unless it's a snap deletion! - */ - if (!deletion) - queue_realm_cap_snaps(realm); - } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", - realm->ino, realm, realm->seq); - } - /* ensure the parent is correct */ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) @@ -651,6 +635,8 @@ more: invalidate += err; if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); realm->created = le64_to_cpu(ri->created); @@ -668,9 +654,17 @@ more: if (err < 0) goto fail; + /* queue realm for cap_snap creation */ + list_add(&realm->dirty_item, &dirty_realms); + invalidate = 1; } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); invalidate = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); } dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, @@ -683,6 +677,14 @@ more: if (invalidate) rebuild_snap_realms(realm); + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + list_for_each_entry(realm, &dirty_realms, dirty_item) { + queue_realm_cap_snaps(realm); + } + __cleanup_empty_realms(mdsc); return 0; @@ -816,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; if (!inode) continue; @@ -841,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout(" will move %p to split realm %llx %p\n", inode, realm->ino, realm); /* - * Remove the inode from the realm's inode - * list, but don't add it to the new realm - * yet. We don't want the cap_snap to be - * queued (again) by ceph_update_snap_trace() - * below. Queue it _now_, under the old context. + * Move the inode to the new realm */ spin_lock(&realm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + oldrealm = ci->i_snap_realm; + ci->i_snap_realm = realm; spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&inode->i_lock); - ceph_queue_cap_snap(ci); + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); iput(inode); continue; @@ -880,43 +884,9 @@ skip_inode: ceph_update_snap_trace(mdsc, p, e, op == CEPH_SNAP_OP_DESTROY); - if (op == CEPH_SNAP_OP_SPLIT) { - /* - * ok, _now_ add the inodes into the new realm. - */ - for (i = 0; i < num_split_inos; i++) { - struct ceph_vino vino = { - .ino = le64_to_cpu(split_inos[i]), - .snap = CEPH_NOSNAP, - }; - struct inode *inode = ceph_find_inode(sb, vino); - struct ceph_inode_info *ci; - - if (!inode) - continue; - ci = ceph_inode(inode); - spin_lock(&inode->i_lock); - if (list_empty(&ci->i_snap_realm_item)) { - struct ceph_snap_realm *oldrealm = - ci->i_snap_realm; - - dout(" moving %p to split realm %llx %p\n", - inode, realm->ino, realm); - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_get_snap_realm(mdsc, realm); - ceph_put_snap_realm(mdsc, oldrealm); - } - spin_unlock(&inode->i_lock); - iput(inode); - } - + if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ ceph_put_snap_realm(mdsc, realm); - } __cleanup_empty_realms(mdsc); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c33897ae5725..c80bfbe27b05 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -690,6 +690,8 @@ struct ceph_snap_realm { struct list_head empty_item; /* if i have ref==0 */ + struct list_head dirty_item; /* if realm needs new context */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; -- cgit v1.2.3 From e835124c2be289515b918f2688ced4249e2de566 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Sep 2010 08:03:08 -0700 Subject: ceph: only send one flushsnap per cap_snap per mds session Sending multiple flushsnap messages is problematic because we ignore the response if the tid doesn't match, and the server may only respond to each one once. It's also a waste. So, skip cap_snaps that are already on the flushing list, unless the caller tells us to resend (because we are reconnecting). Signed-off-by: Sage Weil --- fs/ceph/caps.c | 19 +++++++++++++++---- fs/ceph/snap.c | 2 +- fs/ceph/super.h | 3 ++- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9fbe9019155c..b01c316a8148 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1195,10 +1195,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * + * Unless @again is true, skip cap_snaps that were already sent to + * the MDS (i.e., during this session). + * * Called under i_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession) + struct ceph_mds_session **psession, + int again) __releases(ci->vfs_inode->i_lock) __acquires(ci->vfs_inode->i_lock) { @@ -1240,6 +1244,13 @@ retry: dout("no auth cap (migrating?), doing nothing\n"); goto out; } + + /* only flush each capsnap once */ + if (!again && !list_empty(&capsnap->flushing_item)) { + dout("already flushed %p, skipping\n", capsnap); + continue; + } + mds = ci->i_auth_cap->session->s_mds; mseq = ci->i_auth_cap->mseq; @@ -1314,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) struct inode *inode = &ci->vfs_inode; spin_lock(&inode->i_lock); - __ceph_flush_snaps(ci, NULL); + __ceph_flush_snaps(ci, NULL, 0); spin_unlock(&inode->i_lock); } @@ -1477,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, /* flush snaps first time around only */ if (!list_empty(&ci->i_cap_snaps)) - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: spin_lock(&inode->i_lock); @@ -1894,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, cap, capsnap); - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 1); } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9e6eef14b7df..190b6c4a6f2b 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -717,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) igrab(inode); spin_unlock(&mdsc->snap_flush_lock); spin_lock(&inode->i_lock); - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 0); spin_unlock(&inode->i_lock); iput(inode); spin_lock(&mdsc->snap_flush_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c80bfbe27b05..b87638e84c4b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -828,7 +828,8 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession); + struct ceph_mds_session **psession, + int again); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); -- cgit v1.2.3 From a43fb73101eaf6db0b33d22c152b338ab8b3edbb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Sep 2010 09:54:08 -0700 Subject: ceph: check mapping to determine if FILE_CACHE cap is used See if the i_data mapping has any pages to determine if the FILE_CACHE capability is currently in use, instead of assuming it is any time the rdcache_gen value is set (i.e., issued -> used). This allows the MDS RECALL_STATE process work for inodes that have cached pages. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b01c316a8148..73c153092f72 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -814,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->i_rdcache_gen) + if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; -- cgit v1.2.3 From be4f104dfd3b5e3ae262bff607965cfc38027dec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Sep 2010 12:30:31 -0700 Subject: ceph: select CRYPTO We select CRYPTO_AES, but not CRYPTO. Signed-off-by: Sage Weil --- fs/ceph/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ceph') diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index bc87b9c1d27e..0fcd2640c23f 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,6 +3,7 @@ config CEPH_FS depends on INET && EXPERIMENTAL select LIBCRC32C select CRYPTO_AES + select CRYPTO help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely -- cgit v1.2.3