From 978097c907b58a2d085bbf7632bee1a5a7e6f6ba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Mar 2010 15:27:53 -0800 Subject: ceph: implemented caps should always be superset of issued caps Added assertion, and cleared one case where the implemented caps were not following the issued caps. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index db122bb357b8..57d9b44a8820 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2334,6 +2334,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, revoked_rdcache) reply = 2; /* send revoke ack in check_caps */ cap->issued = newcaps; + cap->implemented |= newcaps; } else if (cap->issued == newcaps) { dout("caps unchanged: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); @@ -2346,6 +2347,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, * pending revocation */ wake = 1; } + BUG_ON(cap->issued & ~cap->implemented); spin_unlock(&inode->i_lock); if (writeback) -- cgit v1.2.3 From 052bb34af3bf8ae2001b9f03d884ba0def3e427c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Mar 2010 12:52:26 -0800 Subject: ceph: add missing locking to protect i_snap_realm_item during split All ci->i_snap_realm_item/realm->inodes_with_caps manipulation should be protected by realm->inodes_with_caps_lock. This bug would have only bit us in a rare race with a realm split (during some snap creations). Signed-off-by: Sage Weil --- fs/ceph/snap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index bf2a5f3846a4..8a43bc8675eb 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -818,7 +818,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, * queued (again) by ceph_update_snap_trace() * below. Queue it _now_, under the old context. */ + spin_lock(&realm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); + spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&inode->i_lock); ceph_queue_cap_snap(ci, -- cgit v1.2.3 From 8b218b8a4a65bf4e304ae8690cadb9100ef029c0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Mar 2010 12:59:08 -0800 Subject: ceph: fix inode removal from snap realm when racing with migration When an inode was dropped while being migrated between two MDSs, i_cap_exporting_issued was non-zero such that issue caps were non-zero and __ceph_is_any_caps(ci) was true. This prevented the inode from being removed from the snap realm, even as it was dropped from the cache. Fix this by dropping any residual i_snap_realm ref in destroy_inode. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7abe1aed819b..aca82d55cc53 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode) ceph_queue_caps_release(inode); + /* + * we may still have a snap_realm reference if there are stray + * caps in i_cap_exporting_issued or i_snap_caps. 
+ */ + if (ci->i_snap_realm) { + struct ceph_mds_client *mdsc = + &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_snap_realm *realm = ci->i_snap_realm; + + dout(" dropping residual ref to snap realm %p\n", realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); -- cgit v1.2.3 From 63733a0fc55cca74b1911769633dc5dfd1a45907 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Mar 2010 15:47:22 -0700 Subject: ceph: fix authenticator timeout We were failing to reconnect to services due to an old authenticator, even though we had the new ticket, because we weren't properly retrying the connect handshake, because we were calling an old/incorrect helper that left in_base_pos incorrect. The result was a failure to reconnect to the OSD or MDS (with an authentication error) if the MDS restarted after the service had been up a few hours (long enough for the original authenticator to be invalid). This was only a problem if the AUTH_X authentication was enabled. Now that the 'negotiate' and 'connect' stages are fully separated, we should use the prepare_read_connect() helper instead, and remove the obsolete one. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 781656a49bf8..203c4359b549 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -830,13 +830,6 @@ static void prepare_read_connect(struct ceph_connection *con) con->in_base_pos = 0; } -static void prepare_read_connect_retry(struct ceph_connection *con) -{ - dout("prepare_read_connect_retry %p\n", con); - con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr) - + sizeof(con->peer_addr_for_me); -} - static void prepare_read_ack(struct ceph_connection *con) { dout("prepare_read_ack %p\n", con); @@ -1146,7 +1139,7 @@ static int process_connect(struct ceph_connection *con) } con->auth_retry = 1; prepare_write_connect(con->msgr, con, 0); - prepare_read_connect_retry(con); + prepare_read_connect(con); break; case CEPH_MSGR_TAG_RESETSESSION: -- cgit v1.2.3 From 807c86e2ceba8febe79b289d50cd0d5e0b0af917 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Mar 2010 15:52:17 -0700 Subject: ceph: fix authenticator buffer size calculation The buffer size was incorrectly calculated for the ceph_x_encrypt() encapsulated ticket blob. Use a helper (with correct arithmetic) and BUG out if we were wrong. 
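The worst-case length the new helper accounts for can be spelled out term by term. The sketch below is illustrative only, not the kernel helper itself; in particular it assumes the extra 16 bytes cover at most one cipher block of padding added by ceph_x_encrypt(), and that the u32 is the length prefix written in front of the ciphertext:

    /* Illustrative sketch of the worst-case ceph_x_encrypt() output size
     * for an ilen-byte payload (assumptions noted above). */
    static int encrypt_buflen_worst_case(int ilen)
    {
            return sizeof(struct ceph_x_encrypt_header) + /* plaintext header */
                   ilen +                                 /* caller's payload */
                   16 +                                   /* at most one block of padding */
                   sizeof(u32);                           /* ciphertext length prefix */
    }

With that bound in place, the BUG_ON(au->buf->vec.iov_len > maxlen) added at the end of ceph_x_build_authorizer() is a cheap assertion that the arithmetic really is an upper bound.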
Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index f0318427b6da..96e7aaa77678 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -28,6 +28,12 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac) return (ac->want_keys & xi->have_keys) == ac->want_keys; } +static int ceph_x_encrypt_buflen(int ilen) +{ + return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + + sizeof(u32); +} + static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *ibuf, int ilen, void *obuf, size_t olen) { @@ -242,7 +248,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th, struct ceph_x_authorizer *au) { - int len; + int maxlen; struct ceph_x_authorize_a *msg_a; struct ceph_x_authorize_b msg_b; void *p, *end; @@ -253,15 +259,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, dout("build_authorizer for %s %p\n", ceph_entity_type_name(th->service), au); - len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) + - ticket_blob_len + 16; - dout(" need len %d\n", len); - if (au->buf && au->buf->alloc_len < len) { + maxlen = sizeof(*msg_a) + sizeof(msg_b) + + ceph_x_encrypt_buflen(ticket_blob_len); + dout(" need len %d\n", maxlen); + if (au->buf && au->buf->alloc_len < maxlen) { ceph_buffer_put(au->buf); au->buf = NULL; } if (!au->buf) { - au->buf = ceph_buffer_new(len, GFP_NOFS); + au->buf = ceph_buffer_new(maxlen, GFP_NOFS); if (!au->buf) return -ENOMEM; } @@ -296,6 +302,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, au->buf->vec.iov_len = p - au->buf->vec.iov_base; dout(" built authorizer nonce %llx len %d\n", au->nonce, (int)au->buf->vec.iov_len); + BUG_ON(au->buf->vec.iov_len > maxlen); return 0; out_buf: -- cgit v1.2.3 From 5b3dbb44ab40660a080d03585bd35f45b2890c49 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Mar 2010 15:57:04 -0700 Subject: ceph: release old ticket_blob buffer Release the old ticket_blob buffer when we get an updated service ticket from the monitor. Previously these were getting leaked. Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 96e7aaa77678..33d3ad4dc456 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -156,6 +156,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct timespec validity; struct ceph_crypto_key old_key; void *tp, *tpend; + struct ceph_buffer *new_ticket_blob; ceph_decode_need(&p, end, sizeof(u32) + 1, bad); @@ -223,9 +224,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); struct_v = ceph_decode_8(&tp); th->secret_id = ceph_decode_64(&tp); - ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend); + ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); if (ret) goto out; + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + th->ticket_blob = new_ticket_blob; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); -- cgit v1.2.3 From 0a990e7093566ceb07e38951e1a01686923d4f09 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Mar 2010 16:12:25 -0700 Subject: ceph: clean up service ticket decoding Previously we would decode state directly into our current ticket_handler. 
This is problematic if for some reason we fail to decode, because we end up with half new state and half old state. We are probably already in bad shape if we get an update we can't decode, but we may as well be tidy anyway. Decode into new_* temporaries and update the ticket_handler only on success. Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 33d3ad4dc456..8d8a84964763 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -156,7 +156,11 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct timespec validity; struct ceph_crypto_key old_key; void *tp, *tpend; + struct ceph_timespec new_validity; + struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; + unsigned long new_expires, new_renew_after; + u64 new_secret_id; ceph_decode_need(&p, end, sizeof(u32) + 1, bad); @@ -189,16 +193,16 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, goto bad; memcpy(&old_key, &th->session_key, sizeof(old_key)); - ret = ceph_crypto_key_decode(&th->session_key, &dp, dend); + ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); if (ret) goto out; - ceph_decode_copy(&dp, &th->validity, sizeof(th->validity)); - ceph_decode_timespec(&validity, &th->validity); - th->expires = get_seconds() + validity.tv_sec; - th->renew_after = th->expires - (validity.tv_sec / 4); - dout(" expires=%lu renew_after=%lu\n", th->expires, - th->renew_after); + ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); + ceph_decode_timespec(&validity, &new_validity); + new_expires = get_seconds() + validity.tv_sec; + new_renew_after = new_expires - (validity.tv_sec / 4); + dout(" expires=%lu renew_after=%lu\n", new_expires, + new_renew_after); /* ticket blob for service */ ceph_decode_8_safe(&p, end, is_enc, bad); @@ -223,13 +227,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); struct_v = ceph_decode_8(&tp); - th->secret_id = ceph_decode_64(&tp); + new_secret_id = ceph_decode_64(&tp); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); if (ret) goto out; + + /* all is well, update our ticket */ + ceph_crypto_key_destroy(&th->session_key); if (th->ticket_blob) ceph_buffer_put(th->ticket_blob); + th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; + th->validity = new_validity; + th->secret_id = new_secret_id; + th->expires = new_expires; + th->renew_after = new_renew_after; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); -- cgit v1.2.3 From 12eadc190038e68b5884a4aa313b6ab89ba60f5e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Mar 2010 22:20:39 -0700 Subject: ceph: fix null pointer deref of r_osd in debug output This causes an oops when debug output is enabled and we kick an osd request with no current r_osd (sometime after an osd failure). Check the pointer before dereferencing. 
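The fix is just a guarded dereference in the debug print; annotated, the pattern is:

    /* r_osd may legitimately be NULL while a request is waiting to be
     * remapped after an osd failure, so print -1 rather than chasing
     * the pointer (same line as in the patch below). */
    dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
         req->r_osd ? req->r_osd->o_osd : -1);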
Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index dbe63db9762f..221038253e61 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -913,7 +913,7 @@ static int __kick_requests(struct ceph_osd_client *osdc, kick: dout("kicking %p tid %llu osd%d\n", req, req->r_tid, - req->r_osd->o_osd); + req->r_osd ? req->r_osd->o_osd : -1); req->r_flags |= CEPH_OSD_FLAG_RETRY; err = __send_request(osdc, req); if (err) { -- cgit v1.2.3
From 4ea0043a29c82ca52ca54728d837314563bec574 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Mar 2010 10:36:40 -0700 Subject: ceph: drop unnecessary WARN_ON in caps migration If we don't have the exported cap it's because we already released it. No need to WARN. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 57d9b44a8820..726c8d445995 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2550,9 +2550,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, ci->i_cap_exporting_issued = cap->issued; } __ceph_remove_cap(cap); - } else { - WARN_ON(!cap); } + /* else, we already released it */ spin_unlock(&inode->i_lock); } -- cgit v1.2.3
From cdc2ce056a3620139056b60ad7f6d355ad13f445 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Mar 2010 13:39:28 -0700 Subject: ceph: fix session locking in handle_caps, ceph_check_caps Passing a session pointer to ceph_check_caps() used to mean it would leave the session mutex locked. That wasn't always possible if it wasn't passed CHECK_CAPS_AUTHONLY. It could unlock the passed session and lock a different session mutex, which was clearly wrong, and also emitted a warning when a racing CPU retook it and we did an unlock from the wrong context. This was only a problem when there was more than one MDS. First, make ceph_check_caps unconditionally drop the session mutex, so that it is free to lock other sessions as needed. Then adjust the one caller that passes in a session (handle_cap_grant) accordingly. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 726c8d445995..782848632e81 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1407,6 +1407,7 @@ static int try_nonblocking_invalidate(struct inode *inode) */ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) + __releases(session->s_mutex) { struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); struct ceph_mds_client *mdsc = &client->mdsc; @@ -1414,7 +1415,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_cap *cap; int file_wanted, used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ - int drop_session_lock = session ?
0 : 1; int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ @@ -1639,7 +1639,7 @@ ack: if (queue_invalidate) ceph_queue_invalidate(inode); - if (session && drop_session_lock) + if (session) mutex_unlock(&session->s_mutex); if (took_snap_rwsem) up_read(&mdsc->snap_rwsem); @@ -2688,14 +2688,17 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: r = handle_cap_grant(inode, h, session, cap, msg->middle); - if (r == 1) + if (r == 1) { ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, session); - else if (r == 2) + session = NULL; + } else if (r == 2) { ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, session); + session = NULL; + } break; case CEPH_CAP_OP_FLUSH_ACK: @@ -2713,7 +2716,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, } done: - mutex_unlock(&session->s_mutex); + if (session) + mutex_unlock(&session->s_mutex); if (check_caps) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL); -- cgit v1.2.3 From 15637c8b1251c38694c32214eba69b72a30e9d9b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Mar 2010 13:42:00 -0700 Subject: ceph: clean up handle_cap_grant, handle_caps wrt session mutex Drop session mutex unconditionally in handle_cap_grant, and do the check_caps from the handle_cap_grant helper. This avoids using a magic return value. Also avoid using a flag variable in the IMPORT case and call check_caps at the appropriate point. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 57 +++++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 782848632e81..d9e860ff9f3a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2195,18 +2195,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * - * caller holds s_mutex. + * caller holds s_mutex and i_lock, we drop both. 
+ * * return value: * 0 - ok * 1 - check_caps on auth cap only (writeback) * 2 - check_caps (ack revoke) */ -static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, - struct ceph_mds_session *session, - struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) +static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, + struct ceph_mds_session *session, + struct ceph_cap *cap, + struct ceph_buffer *xattr_buf) __releases(inode->i_lock) - + __releases(session->s_mutex) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2216,7 +2217,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; - int reply = 0; + int check_caps = 0; int wake = 0; int writeback = 0; int revoked_rdcache = 0; @@ -2329,10 +2330,10 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) writeback = 1; /* will delay ack */ else if (dirty & ~newcaps) - reply = 1; /* initiate writeback in check_caps */ + check_caps = 1; /* initiate writeback in check_caps */ else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || revoked_rdcache) - reply = 2; /* send revoke ack in check_caps */ + check_caps = 2; /* send revoke ack in check_caps */ cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { @@ -2361,7 +2362,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_invalidate(inode); if (wake) wake_up(&ci->i_cap_wq); - return reply; + + if (check_caps == 1) + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + session); + else if (check_caps == 2) + ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); + else + mutex_unlock(&session->s_mutex); } /* @@ -2622,9 +2630,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 cap_id; u64 size, max_size; u64 tid; - int check_caps = 0; void *snaptrace; - int r; dout("handle_caps from mds%d\n", mds); @@ -2669,8 +2675,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, snaptrace, le32_to_cpu(h->snap_trace_len)); - check_caps = 1; /* we may have sent a RELEASE to the old auth */ - goto done; + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, + session); + goto done_unlocked; } /* the rest require a cap */ @@ -2687,19 +2694,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - r = handle_cap_grant(inode, h, session, cap, msg->middle); - if (r == 1) { - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, - session); - session = NULL; - } else if (r == 2) { - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_NODELAY, - session); - session = NULL; - } - break; + handle_cap_grant(inode, h, session, cap, msg->middle); + goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: handle_cap_flush_ack(inode, tid, h, session, cap); @@ -2716,11 +2712,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, } done: - if (session) - mutex_unlock(&session->s_mutex); - - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL); + mutex_unlock(&session->s_mutex); +done_unlocked: if (inode) iput(inode); return; -- cgit v1.2.3 From 916623da10e270c7e9e802a7ddfe1ec8f890982d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Mar 2010 15:01:07 -0700 
Subject: ceph: only release unused caps with mds requests We were releasing used caps (e.g. FILE_CACHE) from encode_inode_release with MDS requests (e.g. setattr). We don't carry refs on most caps, so this code worked most of the time, but for setattr (utimes) we try to drop Fscr. This causes cap state to get slightly out of sync with reality, and may result in subsequent mds revoke messages getting ignored. Fix by only releasing unused caps. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d9e860ff9f3a..7d0a0d0adc18 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2836,11 +2836,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode, struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int ret = 0; - - dout("encode_inode_release %p mds%d drop %s unless %s\n", inode, - mds, ceph_cap_string(drop), ceph_cap_string(unless)); + int used = 0; spin_lock(&inode->i_lock); + used = __ceph_caps_used(ci); + + dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, + mds, ceph_cap_string(used), ceph_cap_string(drop), + ceph_cap_string(unless)); + + /* only drop unused caps */ + drop &= ~used; + cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { if (force || -- cgit v1.2.3 From 80fc7314a7e26e8d2e4ba5b3d8cc2d4aeb750015 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Mar 2010 15:28:54 -0700 Subject: ceph: fix mds sync() race with completing requests The wait_unsafe_requests() helper dropped the mdsc mutex to wait for each request to complete, and then examined r_node to get the next request after retaking the lock. But the request completion removes the request from the tree, so r_node was always undefined at this point. Since it's a small race, it usually led to a valid request, but not always. The result was an occasional crash in rb_next() while dereferencing node->rb_left. Fix this by clearing the rb_node when removing the request from the request tree, and not walking off into the weeds when we are done waiting for a request. Since the request we waited on will _always_ be out of the request tree, take a ref on the next request, in the hopes that it won't be. But if it is, it's ok: we can start over from the beginning (and traverse over older read requests again). 
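The primitive this depends on is that an rb_node which has been erased and then cleared can be recognized as "no longer in the tree". A minimal sketch of that idiom, separate from the mds_client.c code, is:

    #include <linux/rbtree.h>
    #include <linux/types.h>

    struct item {
            struct rb_node node;
            u64 tid;
    };

    /* Erase the node and mark it, so that a sleeper still holding a
     * reference can later test RB_EMPTY_NODE() to see whether the item
     * was removed while it slept. */
    static void remove_item(struct rb_root *root, struct item *it)
    {
            rb_erase(&it->node, root);
            RB_CLEAR_NODE(&it->node);
    }

    static bool item_was_removed(struct item *it)
    {
            return RB_EMPTY_NODE(&it->node);
    }

wait_unsafe_requests() applies exactly that test to the pinned "next" request and simply restarts from the oldest request if it lost the race.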
Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a2600101ec22..5ec864198368 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -529,6 +529,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &mdsc->request_tree); + RB_CLEAR_NODE(&req->r_node); ceph_mdsc_put_request(req); if (req->r_unsafe_dir) { @@ -2682,29 +2683,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) */ static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) { - struct ceph_mds_request *req = NULL; + struct ceph_mds_request *req = NULL, *nextreq; struct rb_node *n; mutex_lock(&mdsc->mutex); dout("wait_unsafe_requests want %lld\n", want_tid); +restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { + /* find next request */ + n = rb_next(&req->r_node); + if (n) + nextreq = rb_entry(n, struct ceph_mds_request, r_node); + else + nextreq = NULL; if ((req->r_op & CEPH_MDS_OP_WRITE)) { /* write op */ ceph_mdsc_get_request(req); + if (nextreq) + ceph_mdsc_get_request(nextreq); mutex_unlock(&mdsc->mutex); dout("wait_unsafe_requests wait on %llu (want %llu)\n", req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); mutex_lock(&mdsc->mutex); - n = rb_next(&req->r_node); ceph_mdsc_put_request(req); - } else { - n = rb_next(&req->r_node); + if (!nextreq) + break; /* next dne before, so we're done! */ + if (RB_EMPTY_NODE(&nextreq->r_node)) { + /* next request was removed from tree */ + ceph_mdsc_put_request(nextreq); + goto restart; + } + ceph_mdsc_put_request(nextreq); /* won't go away */ } - if (!n) - break; - req = rb_entry(n, struct ceph_mds_request, r_node); + req = nextreq; } mutex_unlock(&mdsc->mutex); dout("wait_unsafe_requests done\n"); -- cgit v1.2.3 From efd7576b2392cc5a0934352936d793e8884c46bf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Mar 2010 10:05:28 -0700 Subject: ceph: fix pg pool decoding from incremental osdmap update The incremental map decoding of pg pool updates wasn't skipping the snaps and removed_snaps vectors. This caused osd requests to stall when pool snapshots were created or fs snapshots were deleted. Use a common helper for full and incremental map decoders that decodes pools properly. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index b83f2692b835..d82fe87c2a6e 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -480,6 +480,14 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +void __decode_pool(void **p, struct ceph_pg_pool_info *pi) +{ + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + calc_pg_masks(pi); + *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; +} + /* * decode a full map. 
*/ @@ -526,12 +534,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + __decode_pool(p, pi); __insert_pg_pool(&map->pg_pools, pi); - calc_pg_masks(pi); - *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); - *p += le32_to_cpu(pi->v.num_removed_snap_intervals) - * sizeof(u64) * 2; } ceph_decode_32_safe(p, end, map->pool_max, bad); @@ -714,8 +718,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pi->id = pool; __insert_pg_pool(&map->pg_pools, pi); } - ceph_decode_copy(p, &pi->v, sizeof(pi->v)); - calc_pg_masks(pi); + __decode_pool(p, pi); } /* old_pool */ -- cgit v1.2.3 From e4cb4cb8a03adde1aa4b874623c50b9a5b56e635 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Mar 2010 13:43:09 -0700 Subject: ceph: prevent dup stale messages to console for restarting mds Prevent duplicate 'mds0 caps stale' message from spamming the console every few seconds while the MDS restarts. Set s_renew_requested earlier, so that we only print the message once, even if we don't send an actual request. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5ec864198368..5cbf46abfee3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -863,6 +863,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, if (time_after_eq(jiffies, session->s_cap_ttl) && time_after_eq(session->s_cap_ttl, session->s_renew_requested)) pr_info("mds%d caps stale\n", session->s_mds); + session->s_renew_requested = jiffies; /* do not try to renew caps until a recovering mds has reconnected * with its clients. */ @@ -875,7 +876,6 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, dout("send_renew_caps to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - session->s_renew_requested = jiffies; msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); if (IS_ERR(msg)) -- cgit v1.2.3 From 3c3f2e32effd4c6acc3a9434bd7eecb0af653d89 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Mar 2010 15:20:53 -0700 Subject: ceph: fix connection fault con_work reentrancy problem The messenger fault was clearing the BUSY bit, for reasons unclear. This made it possible for the con->ops->fault function to reopen the connection, and requeue work in the workqueue--even though the current thread was already in con_work. This avoids a problem where the client busy loops with connection failures on an unreachable OSD, but doesn't address the root cause of that problem. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 203c4359b549..983285540945 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1836,8 +1836,6 @@ static void ceph_fault(struct ceph_connection *con) goto out; } - clear_bit(BUSY, &con->state); /* to avoid an improbable race */ - mutex_lock(&con->mutex); if (test_bit(CLOSED, &con->state)) goto out_unlock; -- cgit v1.2.3 From 3dd72fc0e6dc49c79fa9e7cd7c654deac7ccaa29 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Mar 2010 14:42:30 -0700 Subject: ceph: rename r_sent_stamp r_stamp Make variable name slightly more generic, since it will (soon) reflect either the time the request was sent OR the time it was last determined to be still retrying. 
Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 12 ++++++------ fs/ceph/osd_client.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 221038253e61..04359217ea6c 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -633,7 +633,7 @@ static int __send_request(struct ceph_osd_client *osdc, reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ reqhead->reassert_version = req->r_reassert_version; - req->r_sent_stamp = jiffies; + req->r_stamp = jiffies; list_move_tail(&osdc->req_lru, &req->r_req_lru_item); ceph_msg_get(req->r_request); /* send consumes a ref */ @@ -660,7 +660,7 @@ static void handle_timeout(struct work_struct *work) unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; unsigned long keepalive = osdc->client->mount_args->osd_keepalive_timeout * HZ; - unsigned long last_sent = 0; + unsigned long last_stamp = 0; struct rb_node *p; struct list_head slow_osds; @@ -697,12 +697,12 @@ static void handle_timeout(struct work_struct *work) req = list_entry(osdc->req_lru.next, struct ceph_osd_request, r_req_lru_item); - if (time_before(jiffies, req->r_sent_stamp + timeout)) + if (time_before(jiffies, req->r_stamp + timeout)) break; - BUG_ON(req == last_req && req->r_sent_stamp == last_sent); + BUG_ON(req == last_req && req->r_stamp == last_stamp); last_req = req; - last_sent = req->r_sent_stamp; + last_stamp = req->r_stamp; osd = req->r_osd; BUG_ON(!osd); @@ -718,7 +718,7 @@ static void handle_timeout(struct work_struct *work) */ INIT_LIST_HEAD(&slow_osds); list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, req->r_sent_stamp + keepalive)) + if (time_before(jiffies, req->r_stamp + keepalive)) break; osd = req->r_osd; diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index 1b1a3ca43afc..b0759911e7c3 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -70,7 +70,7 @@ struct ceph_osd_request { char r_oid[40]; /* object name */ int r_oid_len; - unsigned long r_sent_stamp; + unsigned long r_stamp; /* send OR check time */ bool r_resend; /* msg send failed, needs retry */ struct ceph_file_layout r_file_layout; -- cgit v1.2.3 From 87b315a5b5cec5d7086494b203577602f5befc8c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Mar 2010 14:51:18 -0700 Subject: ceph: avoid reopening osd connections when address hasn't changed We get a fault callback on _every_ tcp connection fault. Normally, we want to reopen the connection when that happens. If the address we have is bad, however, and connection attempts always result in a connection refused or similar error, explicitly closing and reopening the msgr connection just prevents the messenger's backoff logic from kicking in. The result can be a console full of [ 3974.417106] ceph: osd11 10.3.14.138:6800 connection failed [ 3974.423295] ceph: osd11 10.3.14.138:6800 connection failed [ 3974.429709] ceph: osd11 10.3.14.138:6800 connection failed Instead, if we get a fault, and have outstanding requests, but the osd address hasn't changed and the connection never successfully connected in the first place, do nothing to the osd connection. The messenger layer will back off and retry periodically, because we never connected and thus the lossy bit is not set. Instead, touch each request's r_stamp so that handle_timeout can tell the request is still alive and kicking. 
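Reduced to a predicate, the new policy is: close and reopen only when the osd's address actually changed or the connection has connected successfully at least once. A sketch of that decision (the helper name below is made up; peer_addr and ceph_con_opened() come from the patch):

    /* Hypothetical helper illustrating the __reset_osd() decision. */
    static bool should_reopen_osd_con(struct ceph_connection *con,
                                      struct ceph_entity_addr *map_addr)
    {
            bool addr_changed = memcmp(map_addr, &con->peer_addr,
                                       sizeof(*map_addr)) != 0;

            /* same address and never connected: let the messenger back off */
            return addr_changed || ceph_con_opened(con);
    }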
Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 8 ++++++++ fs/ceph/messenger.h | 1 + fs/ceph/osd_client.c | 15 ++++++++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 983285540945..a32f0f896d9f 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -365,6 +365,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) queue_con(con); } +/* + * return true if this connection ever successfully opened + */ +bool ceph_con_opened(struct ceph_connection *con) +{ + return con->connect_seq > 0; +} + /* * generic get/put */ diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index 4caaa5911110..a343dae73cdc 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con); extern void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr); +extern bool ceph_con_opened(struct ceph_connection *con); extern void ceph_con_close(struct ceph_connection *con); extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 04359217ea6c..c7b4dedaace6 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -413,11 +413,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all) */ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { + struct ceph_osd_request *req; int ret = 0; dout("__reset_osd %p osd%d\n", osd, osd->o_osd); if (list_empty(&osd->o_requests)) { __remove_osd(osdc, osd); + } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], + &osd->o_con.peer_addr, + sizeof(osd->o_con.peer_addr)) == 0 && + !ceph_con_opened(&osd->o_con)) { + dout(" osd addr hasn't changed and connection never opened," + " letting msgr retry"); + /* touch each r_stamp for handle_timeout()'s benfit */ + list_for_each_entry(req, &osd->o_requests, r_osd_item) + req->r_stamp = jiffies; + ret = -EAGAIN; } else { ceph_con_close(&osd->o_con); ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); @@ -862,7 +873,9 @@ static int __kick_requests(struct ceph_osd_client *osdc, dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); if (kickosd) { - __reset_osd(osdc, kickosd); + err = __reset_osd(osdc, kickosd); + if (err == -EAGAIN) + return 1; } else { for (p = rb_first(&osdc->osds); p; p = n) { struct ceph_osd *osd = -- cgit v1.2.3 From ec4318bcb4c59d8b8bf7037c9f444a9887ccb265 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Mar 2010 13:24:39 -0700 Subject: ceph: fix snap rebuild condition We were rebuilding the snap context when it was not necessary (i.e. when the realm seq hadn't changed _and_ the parent seq was still older), which caused page snapc pointers to not match the realm's snapc pointer (even though the snap context itself was identical). This confused begin_write and put it into an endless loop. The correct logic is: rebuild snapc if _my_ realm seq changed, or if my parent realm's seq is newer than mine (and thus mine needs to be rebuilt too). 
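Written as a predicate over the same fields the patch touches (a sketch, not the snap.c code), the cached context may be reused only when:

    static bool cached_snapc_still_valid(struct ceph_snap_realm *realm,
                                         struct ceph_snap_realm *parent)
    {
            if (!realm->cached_context)
                    return false;
            if (realm->cached_context->seq != realm->seq)
                    return false;   /* this realm's seq changed */
            if (parent &&
                realm->cached_context->seq < parent->cached_context->seq)
                    return false;   /* parent realm is newer than our context */
            return true;
    }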
Signed-off-by: Sage Weil --- fs/ceph/snap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 8a43bc8675eb..df04e210a055 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -314,9 +314,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) because we rebuild_snap_realms() works _downward_ in hierarchy after each update.) */ if (realm->cached_context && - realm->cached_context->seq <= realm->seq && + realm->cached_context->seq == realm->seq && (!parent || - realm->cached_context->seq <= parent->cached_context->seq)) { + realm->cached_context->seq >= parent->cached_context->seq)) { dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" " (unchanged)\n", realm->ino, realm, realm->cached_context, -- cgit v1.2.3 From 8f883c24de33ba929c95e018ac0ba66e4f46734b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Mar 2010 13:27:53 -0700 Subject: ceph: make write_begin wait propagate ERESTARTSYS Currently, if the wait_event_interruptible is interrupted, we return EAGAIN unconditionally and loop, such that we aren't, in fact, interruptible. So, propagate ERESTARTSYS if we get it. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 23bb0ceabe31..ce8ef6107727 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -919,6 +919,10 @@ static int context_is_writeable_or_written(struct inode *inode, /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. + * + * called with page locked. + * return success with page locked, + * or any failure (incl -EAGAIN) with page unlocked. */ static int ceph_update_writeable_page(struct file *file, loff_t pos, unsigned len, @@ -961,9 +965,11 @@ retry_locked: snapc = ceph_get_snap_context((void *)page->private); unlock_page(page); ceph_queue_writeback(inode); - wait_event_interruptible(ci->i_cap_wq, + r = wait_event_interruptible(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); + if (r == -ERESTARTSYS) + return r; return -EAGAIN; } @@ -1035,7 +1041,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, int r; do { - /* get a page*/ + /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); if (!page) return -ENOMEM; -- cgit v1.2.3 From 9c423956b8a495f0c048143abc5da955a70eac97 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Mar 2010 20:43:28 -0700 Subject: ceph: propagate mds session allocation failures to caller Return error to original caller if register_session() fails. 
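This leans on the standard kernel ERR_PTR convention; a generic illustration, with made-up struct and function names, is:

    #include <linux/err.h>
    #include <linux/slab.h>

    struct foo { int x; };

    /* Allocator reports -ENOMEM through the returned pointer. */
    static struct foo *alloc_foo(void)
    {
            struct foo *f = kzalloc(sizeof(*f), GFP_NOFS);

            if (!f)
                    return ERR_PTR(-ENOMEM);
            return f;
    }

    /* Caller unpacks the encoded error instead of dereferencing it. */
    static int use_foo(void)
    {
            struct foo *f = alloc_foo();

            if (IS_ERR(f))
                    return PTR_ERR(f);
            kfree(f);
            return 0;
    }

Here __do_request() is the caller that learns to unpack the error; the following patch adds the matching ERR_PTR(-ENOMEM) return inside register_session() itself.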
Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5cbf46abfee3..b6b5348055fc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1567,8 +1567,13 @@ static int __do_request(struct ceph_mds_client *mdsc, /* get, open session */ session = __ceph_lookup_mds_session(mdsc, mds); - if (!session) + if (!session) { session = register_session(mdsc, mds); + if (IS_ERR(session)) { + err = PTR_ERR(session); + goto finish; + } + } dout("do_request mds%d session %p state %s\n", mds, session, session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && -- cgit v1.2.3 From 4736b009b880b7c19bea36327a71032a6dbee402 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 20 Mar 2010 15:30:16 +0300 Subject: ceph: handle kmalloc() failure Return ERR_PTR(-ENOMEM) if kmalloc() fails. We handle allocation failures the same way later in the function. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b6b5348055fc..ad0fbc3128d3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -328,6 +328,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s; s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) + return ERR_PTR(-ENOMEM); s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; -- cgit v1.2.3 From d96d60498ff748c5a88c72ec5d1cc4ba9a583e7e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Mar 2010 20:50:58 -0700 Subject: ceph: fix session check on mds reply Fix a broken check that a reply came back from the same MDS we sent the request to. I don't think a case that actually triggers this would ever come up in practice, but it's clearly wrong and easy to fix. Reported-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ad0fbc3128d3..5268d404963c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1778,7 +1778,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply %p\n", req); /* correct session? */ - if (!req->r_session && req->r_session != session) { + if (req->r_session != session) { pr_err("mdsc_handle_reply got %llu on session mds%d" " not mds%d\n", tid, session->s_mds, req->r_session ? req->r_session->s_mds : -1); -- cgit v1.2.3 From 393f66209669ad23f4f6d4191234c1df4367df3c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 10 Mar 2010 12:03:32 -0800 Subject: ceph: fix possible double-free of mds request reference Clear pointer to mds request after dropping the reference to ensure we don't drop it again, as there is at least one error path through this function that does not reset fi->last_readdir to a new value. 
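The rule being applied is simply that whoever drops the cached reference also clears the field, so no later path can drop it again; with the same field names as the patch:

    /* discard old result, if any -- put the ref and forget the pointer
     * in one place so an error path cannot put it a second time */
    if (fi->last_readdir) {
            ceph_mdsc_put_request(fi->last_readdir);
            fi->last_readdir = NULL;
    }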
Signed-off-by: Sage Weil --- fs/ceph/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 5107384ee029..8a9116e15b70 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -288,8 +288,10 @@ more: CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; /* discard old result, if any */ - if (fi->last_readdir) + if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } /* requery frag tree, as the frag topology may have changed */ frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); -- cgit v1.2.3 From 23ab15ad7a9d042afa7303b735b6e24faa607241 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Mar 2010 09:37:14 -0700 Subject: ceph: avoid loaded term 'OSD' in documention 'OSD' means different things to different people; avoid it here to avoid confusion. Signed-off-by: Sage Weil --- Documentation/filesystems/ceph.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 6e03917316bd..523fdf0828db 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -115,7 +115,7 @@ Mount Options number of entries in that directory. nocrc - Disable CRC32C calculation for data writes. If set, the OSD + Disable CRC32C calculation for data writes. If set, the storage node must rely on TCP's error correction to detect data corruption in the data payload. -- cgit v1.2.3 From 94aa8ae13db2ecf2ec1b4e65a65d3fe92b468e0e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 28 Mar 2010 21:22:50 -0700 Subject: ceph: fix use after free on mds __unregister_request There was a use after free in __unregister_request that would trigger whenever the request map held the last reference. This appears to have triggered an oops during 'umount -f' when requests are being torn down. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5268d404963c..5c7920be6420 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -532,7 +532,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &mdsc->request_tree); RB_CLEAR_NODE(&req->r_node); - ceph_mdsc_put_request(req); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -541,6 +540,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc, list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); } + + ceph_mdsc_put_request(req); } /* -- cgit v1.2.3 From 8136b58dd0fce0b4cb649ac690e0493fb6fdacdb Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Mon, 29 Mar 2010 19:05:57 +0800 Subject: ceph: some documentations fixes New documentation should have an entry in the 00-INDEX. Correct git urls. Signed-off-by: Cheng Renquan Signed-off-by: Sage Weil --- Documentation/filesystems/00-INDEX | 2 ++ Documentation/filesystems/ceph.txt | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 3bae418c6ad3..4303614b5add 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -16,6 +16,8 @@ befs.txt - information about the BeOS filesystem for Linux. bfs.txt - info for the SCO UnixWare Boot Filesystem (BFS). 
+ceph.txt + - info for the Ceph Distributed File System cifs.txt - description of the CIFS filesystem. coda.txt diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 523fdf0828db..0660c9f5deef 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -8,7 +8,7 @@ Basic features include: * POSIX semantics * Seamless scaling from 1 to many thousands of nodes - * High availability and reliability. No single points of failure. + * High availability and reliability. No single point of failure. * N-way replication of data across storage nodes * Fast recovery from node failures * Automatic rebalancing of data on node addition/removal @@ -94,7 +94,7 @@ Mount Options wsize=X Specify the maximum write size in bytes. By default there is no - maximu. Ceph will normally size writes based on the file stripe + maximum. Ceph will normally size writes based on the file stripe size. rsize=X @@ -133,7 +133,8 @@ For more information on Ceph, see the home page at http://ceph.newdream.net/ The Linux kernel client source tree is available at - git://ceph.newdream.net/linux-ceph-client.git + git://ceph.newdream.net/git/ceph-client.git + git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git and the source for the full system is at - git://ceph.newdream.net/ceph.git + git://ceph.newdream.net/git/ceph.git -- cgit v1.2.3 From 82593f87b6c1922a8f8317bb165c6c7794fa4639 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 29 Mar 2010 09:53:23 -0700 Subject: ceph: update discussion list address in MAINTAINERS Signed-off-by: Sage Weil --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 449d44402083..c3b60c024f8b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1443,7 +1443,7 @@ F: arch/powerpc/platforms/cell/ CEPH DISTRIBUTED FILE SYSTEM CLIENT M: Sage Weil -L: ceph-devel@lists.sourceforge.net +L: ceph-devel@vger.kernel.org W: http://ceph.newdream.net/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported -- cgit v1.2.3