From 02ac956c42c6284220f427568d5de3ea64aca41c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 6 Jan 2016 12:56:21 +0300 Subject: libceph: move debugfs initialization into __ceph_open_session() Our debugfs dir name is a concatenation of cluster fsid and client unique ID ("global_id"). It used to be the case that we learned global_id first, nowadays we always learn fsid first - the monmap is sent before any auth replies are. ceph_debugfs_client_init() call in ceph_monc_handle_map() is therefore never executed and can be removed. Its counterpart in handle_auth_reply() doesn't really belong there either: having to do monc->client and unlocking early to work around lockdep is a testament to that. Move it into __ceph_open_session(), where it can be called unconditionally. Signed-off-by: Ilya Dryomov --- net/ceph/ceph_common.c | 3 +++ net/ceph/mon_client.c | 52 +------------------------------------------------- 2 files changed, 4 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index bcbec33c6a14..389dbabba17b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -686,6 +686,9 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) return client->auth_err; } + pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid); + ceph_debugfs_client_init(client); + return 0; } EXPORT_SYMBOL(__ceph_open_session); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index de85dddc3dc0..5ab737ce4f6d 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -353,29 +353,15 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_open_session); -/* - * We require the fsid and global_id in order to initialize our - * debugfs dir. - */ -static bool have_debugfs_info(struct ceph_mon_client *monc) -{ - dout("have_debugfs_info fsid %d globalid %lld\n", - (int)monc->client->have_fsid, monc->auth->global_id); - return monc->client->have_fsid && monc->auth->global_id > 0; -} - static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; void *p, *end; - int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); - had_debugfs_info = have_debugfs_info(monc); - dout("handle_monmap\n"); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -395,29 +381,10 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, client->monc.monmap = monmap; kfree(old); - if (!client->have_fsid) { - client->have_fsid = true; - if (!had_debugfs_info && have_debugfs_info(monc)) { - pr_info("client%lld fsid %pU\n", - ceph_client_id(monc->client), - &monc->client->fsid); - init_debugfs = 1; - } - mutex_unlock(&monc->mutex); + client->have_fsid = true; - if (init_debugfs) { - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(monc->client); - } - - goto out_unlocked; - } out: mutex_unlock(&monc->mutex); -out_unlocked: wake_up_all(&client->auth_wq); } @@ -915,10 +882,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc, { int ret; int was_auth = 0; - int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); - had_debugfs_info = have_debugfs_info(monc); was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, @@ -940,22 +905,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __send_subscribe(monc); __resend_generic_request(monc); } - - if (!had_debugfs_info && have_debugfs_info(monc)) { - pr_info("client%lld fsid %pU\n", - ceph_client_id(monc->client), - &monc->client->fsid); - init_debugfs = 1; - } mutex_unlock(&monc->mutex); - - if (init_debugfs) { - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(monc->client); - } } static int __validate_auth(struct ceph_mon_client *monc) -- cgit v1.2.3 From 0f9af169a1db62c33d87e4cfda46493907bd5537 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 8 Jan 2016 21:17:22 +0300 Subject: libceph: decouple hunting and subs management Coupling hunting state with subscribe state is not a good idea. Clear hunting when we complete the authentication handshake. Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 5ab737ce4f6d..d6af6ca26e8d 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -255,12 +255,6 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, seconds = le32_to_cpu(h->duration); mutex_lock(&monc->mutex); - if (monc->hunting) { - pr_info("mon%d %s session established\n", - monc->cur_mon, - ceph_pr_addr(&monc->con.peer_addr.in_addr)); - monc->hunting = false; - } dout("handle_subscribe_ack after %d seconds\n", seconds); monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; monc->sub_sent = 0; @@ -877,6 +871,14 @@ void ceph_monc_stop(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_stop); +static void finish_hunting(struct ceph_mon_client *monc) +{ + if (monc->hunting) { + dout("%s found mon%d\n", __func__, monc->cur_mon); + monc->hunting = false; + } +} + static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { @@ -890,11 +892,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, msg->front.iov_len, monc->m_auth->front.iov_base, monc->m_auth->front_alloc_len); + if (ret > 0) { + __send_prepared_auth_request(monc, ret); + goto out; + } + + finish_hunting(monc); + if (ret < 0) { monc->client->auth_err = ret; - wake_up_all(&monc->client->auth_wq); - } else if (ret > 0) { - __send_prepared_auth_request(monc, ret); } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); @@ -904,8 +910,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __send_subscribe(monc); __resend_generic_request(monc); + + pr_info("mon%d %s session established\n", monc->cur_mon, + ceph_pr_addr(&monc->con.peer_addr.in_addr)); } + +out: mutex_unlock(&monc->mutex); + if (monc->client->auth_err < 0) + wake_up_all(&monc->client->auth_wq); } static int __validate_auth(struct ceph_mon_client *monc) -- cgit v1.2.3 From 82dcabad750a36a2b749889bc89c5a3188775b2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 Jan 2016 16:19:06 +0100 Subject: libceph: revamp subs code, switch to SUBSCRIBE2 protocol It is currently hard-coded in the mon_client that mdsmap and monmap subs are continuous, while osdmap sub is always "onetime". To better handle full clusters/pools in the osd_client, we need to be able to issue continuous osdmap subs. Revamp subs code to allow us to specify for each sub whether it should be continuous or not. Although not strictly required for the above, switch to SUBSCRIBE2 protocol while at it, eliminating the ambiguity between a request for "every map since X" and a request for "just the latest" when we don't have a map yet (i.e. have epoch 0). SUBSCRIBE2 feature bit is now required - it's been supported since pre-argonaut (2010). Move "got mdsmap" call to the end of ceph_mdsc_handle_map() - calling in before we validate the epoch and successfully install the new map can mess up mon_client sub state. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 +- fs/ceph/super.c | 2 +- include/linux/ceph/ceph_features.h | 2 + include/linux/ceph/ceph_fs.h | 4 +- include/linux/ceph/mon_client.h | 28 +++-- net/ceph/debugfs.c | 17 +-- net/ceph/mon_client.c | 210 +++++++++++++++++++++++-------------- net/ceph/osd_client.c | 3 +- 8 files changed, 174 insertions(+), 95 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 911d64d865f1..b43399d22e23 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3764,7 +3764,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) dout("handle_map epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ - ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { dout("handle_map epoch %u <= our %u\n", @@ -3791,6 +3790,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; __wake_requests(mdsc, &mdsc->waiting_for_map); + ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, + mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); schedule_delayed(mdsc); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ca4d5e8457f1..c941fd1a8eb8 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->monc.want_mdsmap = 1; + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 15151f3c4120..ae2f66833762 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,6 +105,7 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ @@ -127,6 +128,7 @@ static inline u64 ceph_sanitize_features(u64 features) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d7d072a25c27..bf74005eedec 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -198,8 +198,8 @@ struct ceph_client_mount { #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ struct ceph_mon_subscribe_item { - __le64 have_version; __le64 have; - __u8 onetime; + __le64 start; + __u8 flags; } __attribute__ ((packed)); struct ceph_mon_subscribe_ack { diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 81810dc21f06..8b2d2f0b659e 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -68,7 +68,8 @@ struct ceph_mon_client { bool hunting; int cur_mon; /* last monitor i contacted */ - unsigned long sub_sent, sub_renew_after; + unsigned long sub_renew_after; + unsigned long sub_renew_sent; struct ceph_connection con; /* pending generic requests */ @@ -76,10 +77,12 @@ struct ceph_mon_client { int num_generic_requests; u64 last_tid; - /* mds/osd map */ - int want_mdsmap; - int want_next_osdmap; /* 1 = want, 2 = want+asked */ - u32 have_osdmap, have_mdsmap; + /* subs, indexed with CEPH_SUB_* */ + struct { + struct ceph_mon_subscribe_item item; + bool want; + u32 have; /* epoch */ + } subs[3]; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; @@ -93,14 +96,23 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +enum { + CEPH_SUB_MDSMAP = 0, + CEPH_SUB_MONMAP, + CEPH_SUB_OSDMAP, +}; + +extern const char *ceph_sub_str[]; + /* * The model here is to indicate that we need a new map of at least - * epoch @want, and also call in when we receive a map. We will + * epoch @epoch, and also call in when we receive a map. We will * periodically rerequest the map from the monitor cluster until we * get what we want. */ -extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); -extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); +bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, + bool continuous); +void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 593dc2eabcc8..b902fbc7863e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p) struct ceph_mon_generic_request *req; struct ceph_mon_client *monc = &client->monc; struct rb_node *rp; + int i; mutex_lock(&monc->mutex); - if (monc->have_mdsmap) - seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); - if (monc->have_osdmap) - seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); - if (monc->want_next_osdmap) - seq_printf(s, "want next osdmap\n"); + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { + seq_printf(s, "have %s %u", ceph_sub_str[i], + monc->subs[i].have); + if (monc->subs[i].want) + seq_printf(s, " want %llu%s", + le64_to_cpu(monc->subs[i].item.start), + (monc->subs[i].item.flags & + CEPH_SUBSCRIBE_ONETIME ? "" : "+")); + seq_putc(s, '\n'); + } for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { __u16 op; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index d6af6ca26e8d..89029916315c 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -140,9 +140,8 @@ static int __open_session(struct ceph_mon_client *monc) monc->cur_mon = r % monc->monmap->num_mon; dout("open_session num=%d r=%d -> mon%d\n", monc->monmap->num_mon, r, monc->cur_mon); - monc->sub_sent = 0; monc->sub_renew_after = jiffies; /* i.e., expired */ - monc->want_next_osdmap = !!monc->want_next_osdmap; + monc->sub_renew_sent = 0; dout("open_session mon%d opening\n", monc->cur_mon); ceph_con_open(&monc->con, @@ -189,59 +188,58 @@ static void __schedule_delayed(struct ceph_mon_client *monc) round_jiffies_relative(delay)); } +const char *ceph_sub_str[] = { + [CEPH_SUB_MDSMAP] = "mdsmap", + [CEPH_SUB_MONMAP] = "monmap", + [CEPH_SUB_OSDMAP] = "osdmap", +}; + /* - * Send subscribe request for mdsmap and/or osdmap. + * Send subscribe request for one or more maps, according to + * monc->subs. */ static void __send_subscribe(struct ceph_mon_client *monc) { - dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", - (unsigned int)monc->sub_sent, __sub_expired(monc), - monc->want_next_osdmap); - if ((__sub_expired(monc) && !monc->sub_sent) || - monc->want_next_osdmap == 1) { - struct ceph_msg *msg = monc->m_subscribe; - struct ceph_mon_subscribe_item *i; - void *p, *end; - int num; - - p = msg->front.iov_base; - end = p + msg->front_alloc_len; - - num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; - ceph_encode_32(&p, num); - - if (monc->want_next_osdmap) { - dout("__send_subscribe to 'osdmap' %u\n", - (unsigned int)monc->have_osdmap); - ceph_encode_string(&p, end, "osdmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_osdmap); - i->onetime = 1; - p += sizeof(*i); - monc->want_next_osdmap = 2; /* requested */ - } - if (monc->want_mdsmap) { - dout("__send_subscribe to 'mdsmap' %u+\n", - (unsigned int)monc->have_mdsmap); - ceph_encode_string(&p, end, "mdsmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_mdsmap); - i->onetime = 0; - p += sizeof(*i); - } - ceph_encode_string(&p, end, "monmap", 6); - i = p; - i->have = 0; - i->onetime = 0; - p += sizeof(*i); - - msg->front.iov_len = p - msg->front.iov_base; - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_msg_revoke(msg); - ceph_con_send(&monc->con, ceph_msg_get(msg)); - - monc->sub_sent = jiffies | 1; /* never 0 */ + struct ceph_msg *msg = monc->m_subscribe; + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + int num = 0; + int i; + + dout("%s sent %lu\n", __func__, monc->sub_renew_sent); + + BUG_ON(monc->cur_mon < 0); + + if (!monc->sub_renew_sent) + monc->sub_renew_sent = jiffies | 1; /* never 0 */ + + msg->hdr.version = cpu_to_le16(2); + + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { + if (monc->subs[i].want) + num++; } + BUG_ON(num < 1); /* monmap sub is always there */ + ceph_encode_32(&p, num); + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { + const char *s = ceph_sub_str[i]; + + if (!monc->subs[i].want) + continue; + + dout("%s %s start %llu flags 0x%x\n", __func__, s, + le64_to_cpu(monc->subs[i].item.start), + monc->subs[i].item.flags); + ceph_encode_string(&p, end, s, strlen(s)); + memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); + p += sizeof(monc->subs[i].item); + } + + BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_msg_revoke(msg); + ceph_con_send(&monc->con, ceph_msg_get(msg)); } static void handle_subscribe_ack(struct ceph_mon_client *monc, @@ -255,9 +253,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, seconds = le32_to_cpu(h->duration); mutex_lock(&monc->mutex); - dout("handle_subscribe_ack after %d seconds\n", seconds); - monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; - monc->sub_sent = 0; + if (monc->sub_renew_sent) { + monc->sub_renew_after = monc->sub_renew_sent + + (seconds >> 1) * HZ - 1; + dout("%s sent %lu duration %d renew after %lu\n", __func__, + monc->sub_renew_sent, seconds, monc->sub_renew_after); + monc->sub_renew_sent = 0; + } else { + dout("%s sent %lu renew after %lu, ignoring\n", __func__, + monc->sub_renew_sent, monc->sub_renew_after); + } mutex_unlock(&monc->mutex); return; bad: @@ -266,36 +271,82 @@ bad: } /* - * Keep track of which maps we have + * Register interest in a map + * + * @sub: one of CEPH_SUB_* + * @epoch: X for "every map since X", or 0 for "just the latest" */ -int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) +static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub, + u32 epoch, bool continuous) +{ + __le64 start = cpu_to_le64(epoch); + u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0; + + dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub], + epoch, continuous); + + if (monc->subs[sub].want && + monc->subs[sub].item.start == start && + monc->subs[sub].item.flags == flags) + return false; + + monc->subs[sub].item.start = start; + monc->subs[sub].item.flags = flags; + monc->subs[sub].want = true; + + return true; +} + +bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, + bool continuous) { + bool need_request; + mutex_lock(&monc->mutex); - monc->have_mdsmap = got; + need_request = __ceph_monc_want_map(monc, sub, epoch, continuous); mutex_unlock(&monc->mutex); - return 0; + + return need_request; } -EXPORT_SYMBOL(ceph_monc_got_mdsmap); +EXPORT_SYMBOL(ceph_monc_want_map); -int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) +/* + * Keep track of which maps we have + * + * @sub: one of CEPH_SUB_* + */ +static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub, + u32 epoch) +{ + dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch); + + if (monc->subs[sub].want) { + if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME) + monc->subs[sub].want = false; + else + monc->subs[sub].item.start = cpu_to_le64(epoch + 1); + } + + monc->subs[sub].have = epoch; +} + +void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) { mutex_lock(&monc->mutex); - monc->have_osdmap = got; - monc->want_next_osdmap = 0; + __ceph_monc_got_map(monc, sub, epoch); mutex_unlock(&monc->mutex); - return 0; } +EXPORT_SYMBOL(ceph_monc_got_map); /* * Register interest in the next osdmap */ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) { - dout("request_next_osdmap have %u\n", monc->have_osdmap); + dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); mutex_lock(&monc->mutex); - if (!monc->want_next_osdmap) - monc->want_next_osdmap = 1; - if (monc->want_next_osdmap < 2) + if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, + monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) __send_subscribe(monc); mutex_unlock(&monc->mutex); } @@ -314,15 +365,15 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, long ret; mutex_lock(&monc->mutex); - while (monc->have_osdmap < epoch) { + while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) { mutex_unlock(&monc->mutex); if (timeout && time_after_eq(jiffies, started + timeout)) return -ETIMEDOUT; ret = wait_event_interruptible_timeout(monc->client->auth_wq, - monc->have_osdmap >= epoch, - ceph_timeout_jiffies(timeout)); + monc->subs[CEPH_SUB_OSDMAP].have >= epoch, + ceph_timeout_jiffies(timeout)); if (ret < 0) return ret; @@ -335,11 +386,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, EXPORT_SYMBOL(ceph_monc_wait_osdmap); /* - * + * Open a session with a random monitor. Request monmap and osdmap, + * which are waited upon in __ceph_open_session(). */ int ceph_monc_open_session(struct ceph_mon_client *monc) { mutex_lock(&monc->mutex); + __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true); + __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false); __open_session(monc); __schedule_delayed(monc); mutex_unlock(&monc->mutex); @@ -375,6 +429,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, client->monc.monmap = monmap; kfree(old); + __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch); client->have_fsid = true; out: @@ -725,8 +780,14 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); } - if (is_auth) - __send_subscribe(monc); + if (is_auth) { + unsigned long now = jiffies; + + dout("%s renew subs? now %lu renew after %lu\n", + __func__, now, monc->sub_renew_after); + if (time_after_eq(now, monc->sub_renew_after)) + __send_subscribe(monc); + } } __schedule_delayed(monc); mutex_unlock(&monc->mutex); @@ -815,16 +876,13 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->cur_mon = -1; monc->hunting = true; monc->sub_renew_after = jiffies; - monc->sub_sent = 0; + monc->sub_renew_sent = 0; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; monc->num_generic_requests = 0; monc->last_tid = 0; - monc->have_mdsmap = 0; - monc->have_osdmap = 0; - monc->want_next_osdmap = 1; return 0; out_auth_reply: diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5bc053778fed..3309112e23d0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2187,7 +2187,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) goto bad; done: downgrade_write(&osdc->map_sem); - ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch); /* * subscribe to subsequent osdmap updates if full to ensure -- cgit v1.2.3 From 0e04dc26cc594d31ee6b1382b452b6bc83b57937 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Jan 2016 17:50:31 +0100 Subject: libceph: pick a different monitor when reconnecting Don't try to reconnect to the same monitor when we fail to establish a session within a timeout or it's lost. For that, pick_new_mon() needs to see the old value of cur_mon, so don't clear it in __close_session() - all calls to __close_session() but one are followed by __open_session() anyway. __open_session() is only called when a new session needs to be established, so the "already open?" branch, which is now in the way, is simply dropped. Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 85 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 89029916315c..accfded53bae 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -122,45 +122,74 @@ static void __close_session(struct ceph_mon_client *monc) ceph_msg_revoke(monc->m_subscribe); ceph_msg_revoke_incoming(monc->m_subscribe_ack); ceph_con_close(&monc->con); - monc->cur_mon = -1; + monc->pending_auth = 0; ceph_auth_reset(monc->auth); } /* - * Open a session with a (new) monitor. + * Pick a new monitor at random and set cur_mon. If we are repicking + * (i.e. cur_mon is already set), be sure to pick a different one. */ -static int __open_session(struct ceph_mon_client *monc) +static void pick_new_mon(struct ceph_mon_client *monc) { - char r; - int ret; + int old_mon = monc->cur_mon; - if (monc->cur_mon < 0) { - get_random_bytes(&r, 1); - monc->cur_mon = r % monc->monmap->num_mon; - dout("open_session num=%d r=%d -> mon%d\n", - monc->monmap->num_mon, r, monc->cur_mon); - monc->sub_renew_after = jiffies; /* i.e., expired */ - monc->sub_renew_sent = 0; + BUG_ON(monc->monmap->num_mon < 1); - dout("open_session mon%d opening\n", monc->cur_mon); - ceph_con_open(&monc->con, - CEPH_ENTITY_TYPE_MON, monc->cur_mon, - &monc->monmap->mon_inst[monc->cur_mon].addr); + if (monc->monmap->num_mon == 1) { + monc->cur_mon = 0; + } else { + int max = monc->monmap->num_mon; + int o = -1; + int n; + + if (monc->cur_mon >= 0) { + if (monc->cur_mon < monc->monmap->num_mon) + o = monc->cur_mon; + if (o >= 0) + max--; + } - /* send an initial keepalive to ensure our timestamp is - * valid by the time we are in an OPENED state */ - ceph_con_keepalive(&monc->con); + n = prandom_u32() % max; + if (o >= 0 && n >= o) + n++; - /* initiatiate authentication handshake */ - ret = ceph_auth_build_hello(monc->auth, - monc->m_auth->front.iov_base, - monc->m_auth->front_alloc_len); - __send_prepared_auth_request(monc, ret); - } else { - dout("open_session mon%d already open\n", monc->cur_mon); + monc->cur_mon = n; } - return 0; + + dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon, + monc->cur_mon, monc->monmap->num_mon); +} + +/* + * Open a session with a new monitor. + */ +static void __open_session(struct ceph_mon_client *monc) +{ + int ret; + + pick_new_mon(monc); + + monc->sub_renew_after = jiffies; /* i.e., expired */ + monc->sub_renew_sent = 0; + + dout("%s opening mon%d\n", __func__, monc->cur_mon); + ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon, + &monc->monmap->mon_inst[monc->cur_mon].addr); + + /* + * send an initial keepalive to ensure our timestamp is valid + * by the time we are in an OPENED state + */ + ceph_con_keepalive(&monc->con); + + /* initiate authentication handshake */ + ret = ceph_auth_build_hello(monc->auth, + monc->m_auth->front.iov_base, + monc->m_auth->front_alloc_len); + BUG_ON(ret <= 0); + __send_prepared_auth_request(monc, ret); } static bool __sub_expired(struct ceph_mon_client *monc) @@ -907,7 +936,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc) mutex_lock(&monc->mutex); __close_session(monc); - + monc->cur_mon = -1; mutex_unlock(&monc->mutex); /* -- cgit v1.2.3 From 58d81b1294f02262a141687cd62529c1ec8e6484 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:15 +0100 Subject: libceph: monc ping rate is 10s Split ping interval and ping timeout: ping interval is 10s; keepalive timeout is 30s. Make monc_ping_timeout a constant while at it - it's not actually exported as a mount option (and the rest of tick-related settings won't be either), so it's got no place in ceph_options. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 5 +++-- net/ceph/ceph_common.c | 1 - net/ceph/mon_client.c | 8 ++------ 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 3e3799cdc6e6..f5466273b9a3 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -47,7 +47,6 @@ struct ceph_options { unsigned long mount_timeout; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */ - unsigned long monc_ping_timeout; /* jiffies */ /* * any type that can't be simply compared or doesn't need need @@ -68,7 +67,9 @@ struct ceph_options { #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) -#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000) + +#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) +#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 389dbabba17b..dcc18c6f7cf9 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -361,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name, opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; - opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] */ diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index accfded53bae..23a270c49baf 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -202,15 +202,12 @@ static bool __sub_expired(struct ceph_mon_client *monc) */ static void __schedule_delayed(struct ceph_mon_client *monc) { - struct ceph_options *opt = monc->client->options; unsigned long delay; if (monc->cur_mon < 0 || __sub_expired(monc)) { delay = 10 * HZ; } else { - delay = 20 * HZ; - if (opt->monc_ping_timeout > 0) - delay = min(delay, opt->monc_ping_timeout / 3); + delay = CEPH_MONC_PING_INTERVAL; } dout("__schedule_delayed after %lu\n", delay); schedule_delayed_work(&monc->delayed_work, @@ -793,10 +790,9 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - struct ceph_options *opt = monc->client->options; int is_auth = ceph_auth_is_authenticated(monc->auth); if (ceph_con_keepalive_expired(&monc->con, - opt->monc_ping_timeout)) { + CEPH_MONC_PING_TIMEOUT)) { dout("monc keepalive timeout\n"); is_auth = 0; __close_session(monc); -- cgit v1.2.3 From 168b9090c739c4b5556023a3f08789b349ca7339 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:19 +0100 Subject: libceph: monc hunt rate is 3s with backoff up to 30s Unless we are in the process of setting up a client (i.e. connecting to the monitor cluster for the first time), apply a backoff: every time we want to reopen a session, increase our timeout by a multiple (currently 2); when we complete the connection, reduce that multipler by 50%. Mirrors ceph.git commit 794c86fd289bd62a35ed14368fa096c46736e9a2. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 +++ include/linux/ceph/mon_client.h | 3 +++ net/ceph/mon_client.c | 25 ++++++++++++++++--------- 3 files changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index f5466273b9a3..e7975e4681e1 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -68,8 +68,11 @@ struct ceph_options { #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) #define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) +#define CEPH_MONC_HUNT_BACKOFF 2 +#define CEPH_MONC_HUNT_MAX_MULT 10 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 8b2d2f0b659e..e230e7ed60d3 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -72,6 +72,9 @@ struct ceph_mon_client { unsigned long sub_renew_sent; struct ceph_connection con; + bool had_a_connection; + int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */ + /* pending generic requests */ struct rb_root generic_request_tree; int num_generic_requests; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 23a270c49baf..fd1cf408fd89 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -171,6 +171,12 @@ static void __open_session(struct ceph_mon_client *monc) pick_new_mon(monc); + if (monc->had_a_connection) { + monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF; + if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT) + monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT; + } + monc->sub_renew_after = jiffies; /* i.e., expired */ monc->sub_renew_sent = 0; @@ -192,11 +198,6 @@ static void __open_session(struct ceph_mon_client *monc) __send_prepared_auth_request(monc, ret); } -static bool __sub_expired(struct ceph_mon_client *monc) -{ - return time_after_eq(jiffies, monc->sub_renew_after); -} - /* * Reschedule delayed work timer. */ @@ -204,11 +205,11 @@ static void __schedule_delayed(struct ceph_mon_client *monc) { unsigned long delay; - if (monc->cur_mon < 0 || __sub_expired(monc)) { - delay = 10 * HZ; - } else { + if (monc->hunting) + delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult; + else delay = CEPH_MONC_PING_INTERVAL; - } + dout("__schedule_delayed after %lu\n", delay); schedule_delayed_work(&monc->delayed_work, round_jiffies_relative(delay)); @@ -902,6 +903,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->hunting = true; monc->sub_renew_after = jiffies; monc->sub_renew_sent = 0; + monc->had_a_connection = false; + monc->hunt_mult = 1; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; @@ -959,6 +962,10 @@ static void finish_hunting(struct ceph_mon_client *monc) if (monc->hunting) { dout("%s found mon%d\n", __func__, monc->cur_mon); monc->hunting = false; + monc->had_a_connection = true; + monc->hunt_mult /= 2; /* reduce by 50% */ + if (monc->hunt_mult < 1) + monc->hunt_mult = 1; } } -- cgit v1.2.3 From 1752b50ca240a7f722f57e81ba04496eb15c466f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:25 +0100 Subject: libceph: introduce and switch to reopen_session() hunting is now set in __open_session() and cleared in finish_hunting(), instead of all around. The "session lost" message is printed not only on connection resets, but also on keepalive timeouts. Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index fd1cf408fd89..816fb813a336 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -171,6 +171,7 @@ static void __open_session(struct ceph_mon_client *monc) pick_new_mon(monc); + monc->hunting = true; if (monc->had_a_connection) { monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF; if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT) @@ -198,6 +199,16 @@ static void __open_session(struct ceph_mon_client *monc) __send_prepared_auth_request(monc, ret); } +static void reopen_session(struct ceph_mon_client *monc) +{ + if (!monc->hunting) + pr_info("mon%d %s session lost, hunting for new mon\n", + monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr)); + + __close_session(monc); + __open_session(monc); +} + /* * Reschedule delayed work timer. */ @@ -788,17 +799,15 @@ static void delayed_work(struct work_struct *work) dout("monc delayed_work\n"); mutex_lock(&monc->mutex); if (monc->hunting) { - __close_session(monc); - __open_session(monc); /* continue hunting */ + dout("%s continuing hunt\n", __func__); + reopen_session(monc); } else { int is_auth = ceph_auth_is_authenticated(monc->auth); if (ceph_con_keepalive_expired(&monc->con, CEPH_MONC_PING_TIMEOUT)) { dout("monc keepalive timeout\n"); is_auth = 0; - __close_session(monc); - monc->hunting = true; - __open_session(monc); + reopen_session(monc); } if (!monc->hunting) { @@ -900,9 +909,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) &monc->client->msgr); monc->cur_mon = -1; - monc->hunting = true; - monc->sub_renew_after = jiffies; - monc->sub_renew_sent = 0; monc->had_a_connection = false; monc->hunt_mult = 1; @@ -1157,16 +1163,9 @@ static void mon_fault(struct ceph_connection *con) if (!con->private) goto out; - if (!monc->hunting) - pr_info("mon%d %s session lost, " - "hunting for new mon\n", monc->cur_mon, - ceph_pr_addr(&monc->con.peer_addr.in_addr)); - - __close_session(monc); if (!monc->hunting) { - /* start hunting */ - monc->hunting = true; - __open_session(monc); + dout("%s hunting for new mon\n", __func__); + reopen_session(monc); } else { /* already hunting, let's wait a bit */ __schedule_delayed(monc); -- cgit v1.2.3 From bee3a37c470e4febcb05556ceafcb70929268edd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 Jan 2016 18:42:17 +0100 Subject: libceph: reschedule tick in mon_fault() Doing __schedule_delayed() in the hunting branch is pointless, as the tick will have already been scheduled by then. What we need to do instead is *reschedule* it in the !hunting branch, after reopen_session() changes hunt_mult, which affects the delay. This helps with spacing out connection attempts and avoiding things like two back-to-back attempts followed by a longer period of waiting around. Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 816fb813a336..a2b45cf79dca 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -222,8 +222,8 @@ static void __schedule_delayed(struct ceph_mon_client *monc) delay = CEPH_MONC_PING_INTERVAL; dout("__schedule_delayed after %lu\n", delay); - schedule_delayed_work(&monc->delayed_work, - round_jiffies_relative(delay)); + mod_delayed_work(system_wq, &monc->delayed_work, + round_jiffies_relative(delay)); } const char *ceph_sub_str[] = { @@ -1166,9 +1166,9 @@ static void mon_fault(struct ceph_connection *con) if (!monc->hunting) { dout("%s hunting for new mon\n", __func__); reopen_session(monc); - } else { - /* already hunting, let's wait a bit */ __schedule_delayed(monc); + } else { + dout("%s already hunting\n", __func__); } out: mutex_unlock(&monc->mutex); -- cgit v1.2.3 From b5d91704f53efc5a2a93e88e323877e6889b0f5b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 23 Jan 2016 15:57:51 +0100 Subject: libceph: behave in mon_fault() if cur_mon < 0 This can happen if __close_session() in ceph_monc_stop() races with a connection reset. We need to ignore such faults, otherwise it's likely we would take !hunting, call __schedule_delayed() and end up with delayed_work() executing on invalid memory, among other things. The (two!) con->private tests are useless, as nothing ever clears con->private. Nuke them. Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a2b45cf79dca..cf638c009cfa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1155,22 +1155,17 @@ static void mon_fault(struct ceph_connection *con) { struct ceph_mon_client *monc = con->private; - if (!monc) - return; - - dout("mon_fault\n"); mutex_lock(&monc->mutex); - if (!con->private) - goto out; - - if (!monc->hunting) { - dout("%s hunting for new mon\n", __func__); - reopen_session(monc); - __schedule_delayed(monc); - } else { - dout("%s already hunting\n", __func__); + dout("%s mon%d\n", __func__, monc->cur_mon); + if (monc->cur_mon >= 0) { + if (!monc->hunting) { + dout("%s hunting for new mon\n", __func__); + reopen_session(monc); + __schedule_delayed(monc); + } else { + dout("%s already hunting\n", __func__); + } } -out: mutex_unlock(&monc->mutex); } -- cgit v1.2.3 From de2aa102ea464a54dba14b9588e0bc188bd94707 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 8 Feb 2016 13:39:46 +0100 Subject: libceph: rename ceph_osd_req_op::payload_len to indata_len Follow userspace nomenclature on this - the next commit adds outdata_len. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 7506b485bb6d..35c8b006916f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -77,7 +77,7 @@ struct ceph_osd_data { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */ - u32 payload_len; + u32 indata_len; /* request */ union { struct ceph_osd_data raw_data_in; struct { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3309112e23d0..84075539135b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -498,7 +498,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; - op->payload_len = payload_len; + op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_extent_init); @@ -517,7 +517,7 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, BUG_ON(length > previous); op->extent.length = length; - op->payload_len -= previous - length; + op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); @@ -554,7 +554,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, op->cls.argc = 0; /* currently unused */ - op->payload_len = payload_len; + op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -587,7 +587,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, op->xattr.cmp_mode = cmp_mode; ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); - op->payload_len = payload_len; + op->indata_len = payload_len; return 0; } EXPORT_SYMBOL(osd_req_op_xattr_init); @@ -707,7 +707,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); dst->cls.indata_len = cpu_to_le32(data_length); ceph_osdc_msg_data_add(req->r_request, osd_data); - src->payload_len += data_length; + src->indata_len += data_length; request_data_len += data_length; } osd_data = &src->cls.response_data; @@ -750,7 +750,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->op = cpu_to_le16(src->op); dst->flags = cpu_to_le32(src->flags); - dst->payload_len = cpu_to_le32(src->payload_len); + dst->payload_len = cpu_to_le32(src->indata_len); return request_data_len; } -- cgit v1.2.3 From 7665d85b7307fa0218881bc2009de067c42dc52e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Jan 2016 16:48:57 +0800 Subject: libceph: move r_reply_op_{len,result} into struct ceph_osd_req_op This avoids defining large array of r_reply_op_{len,result} in in struct ceph_osd_request. Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 5 +++-- net/ceph/osd_client.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4a876785b68c..94f31bde73e8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1854,7 +1854,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, * passed to the block layer, which just supports a 32-bit * length field. */ - obj_request->xferred = osd_req->r_reply_op_len[0]; + obj_request->xferred = osd_req->r_ops[0].outdata_len; rbd_assert(obj_request->xferred < (u64)UINT_MAX); opcode = osd_req->r_ops[0].op; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 35c8b006916f..c6d1d603bacf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -78,6 +78,9 @@ struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */ u32 indata_len; /* request */ + u32 outdata_len; /* reply */ + s32 rval; + union { struct ceph_osd_data raw_data_in; struct { @@ -148,8 +151,6 @@ struct ceph_osd_request { struct ceph_eversion *r_request_reassert_version; int r_result; - int r_reply_op_len[CEPH_OSD_MAX_OP]; - s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 84075539135b..1048edb44343 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1821,7 +1821,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) int len; len = le32_to_cpu(op->payload_len); - req->r_reply_op_len[i] = len; + req->r_ops[i].outdata_len = len; dout(" op %d has %d bytes\n", i, len); payload_len += len; p += sizeof(*op); @@ -1836,7 +1836,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_decode_need(&p, end, 4 + numops * 4, bad_put); retry_attempt = ceph_decode_32(&p); for (i = 0; i < numops; i++) - req->r_reply_op_result[i] = ceph_decode_32(&p); + req->r_ops[i].rval = ceph_decode_32(&p); if (le16_to_cpu(msg->hdr.version) >= 6) { p += 8 + 4; /* skip replay_version */ -- cgit v1.2.3 From ae458f5a171badcce60bba9024dbdc5488e5e387 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 11 Feb 2016 13:09:15 +0100 Subject: libceph: make r_request msg_size calculation clearer Although msg_size is calculated correctly, the terms are grouped in a misleading way - snaps appears to not have room for a u32 length. Move calculation closer to its use and regroup terms. No functional change. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1048edb44343..109d1f82cac5 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -372,16 +372,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); BUG_ON(num_ops > CEPH_OSD_MAX_OP); - msg_size = 4 + 4 + 8 + 8 + 4+8; - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ - msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ - msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); - msg_size += 8; /* snapid */ - msg_size += 8; /* snap_seq */ - msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ - msg_size += 4; - if (use_mempool) { req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); @@ -420,6 +410,17 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; + msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ + msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += 1 + 8 + 4 + 4; /* pgid */ + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ + msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); + msg_size += 8; /* snapid */ + msg_size += 8; /* snap_seq */ + msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4; /* retry_attempt */ + /* create request message; allow space for oid */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); -- cgit v1.2.3 From 9e767adbd3961760af5d56efe45fa8217cce7db6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 9 Feb 2016 17:25:31 +0100 Subject: libceph: osdc->req_mempool should be backed by a slab pool ceph_osd_request_cache was introduced a long time ago. Also, osd_req is about to get a flexible array member, which ceph_osd_request_cache is going to be aware of. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 109d1f82cac5..f93d0e893f3f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2648,8 +2648,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) round_jiffies_relative(osdc->client->options->osd_idle_ttl)); err = -ENOMEM; - osdc->req_mempool = mempool_create_kmalloc_pool(10, - sizeof(struct ceph_osd_request)); + osdc->req_mempool = mempool_create_slab_pool(10, + ceph_osd_request_cache); if (!osdc->req_mempool) goto out; -- cgit v1.2.3 From 3f1af42ad0fad8a12242233dd0d9fc42f5e83415 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 9 Feb 2016 17:50:15 +0100 Subject: libceph: enable large, variable-sized OSD requests Turn r_ops into a flexible array member to enable large, consisting of up to 16 ops, OSD requests. The use case is scattered writeback in cephfs and, as far as the kernel client is concerned, 16 is just a made up number. r_ops had size 3 for copyup+hint+write, but copyup is really a special case - it can only happen once. ceph_osd_request_cache is therefore stuffed with num_ops=2 requests, anything bigger than that is allocated with kmalloc(). req_mempool is backed by ceph_osd_request_cache, which means either num_ops=1 or num_ops=2 for use_mempool=true - all existing users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with that. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 -- include/linux/ceph/osd_client.h | 6 ++++-- net/ceph/osd_client.c | 43 +++++++++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 94f31bde73e8..08018710f363 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1847,8 +1847,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); - /* * We support a 64-bit length, but ultimately it has to be * passed to the block layer, which just supports a 32-bit diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c6d1d603bacf..aada6a1383a4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -43,7 +43,8 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 3 +#define CEPH_OSD_SLAB_OPS 2 +#define CEPH_OSD_MAX_OPS 16 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE = 0, @@ -139,7 +140,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; /* these are updated on each send */ __le32 *r_request_osdmap_epoch; @@ -175,6 +175,8 @@ struct ceph_osd_request { unsigned long r_stamp; /* send OR check time */ struct ceph_snap_context *r_snapc; /* snap context for writes */ + + struct ceph_osd_req_op r_ops[]; }; struct ceph_request_redirect { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f93d0e893f3f..ccd4e031fa3f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref) ceph_put_snap_context(req->r_snapc); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); - else + else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) kmem_cache_free(ceph_osd_request_cache, req); - + else + kfree(req); } void ceph_osdc_get_request(struct ceph_osd_request *req) @@ -369,18 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; - BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); - BUG_ON(num_ops > CEPH_OSD_MAX_OP); - if (use_mempool) { + BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); req = mempool_alloc(osdc->req_mempool, gfp_flags); - memset(req, 0, sizeof(*req)); + } else if (num_ops <= CEPH_OSD_SLAB_OPS) { + req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); } else { - req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); + BUG_ON(num_ops > CEPH_OSD_MAX_OPS); + req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]), + gfp_flags); } - if (req == NULL) + if (unlikely(!req)) return NULL; + /* req only, each op is zeroed in _osd_req_op_init() */ + memset(req, 0, sizeof(*req)); + req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; @@ -398,12 +403,19 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_base_oloc.pool = -1; req->r_target_oloc.pool = -1; + msg_size = OSD_OPREPLY_FRONT_LEN; + if (num_ops > CEPH_OSD_SLAB_OPS) { + /* ceph_osd_op and rval */ + msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * + (sizeof(struct ceph_osd_op) + 4); + } + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, gfp_flags, true); + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, + gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; @@ -1811,7 +1823,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_decode_need(&p, end, 4, bad_put); numops = ceph_decode_32(&p); - if (numops > CEPH_OSD_MAX_OP) + if (numops > CEPH_OSD_MAX_OPS) goto bad_put; if (numops != req->r_num_ops) goto bad_put; @@ -2784,11 +2796,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages); int ceph_osdc_setup(void) { + size_t size = sizeof(struct ceph_osd_request) + + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); + BUG_ON(ceph_osd_request_cache); - ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", - sizeof (struct ceph_osd_request), - __alignof__(struct ceph_osd_request), - 0, NULL); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, + 0, 0, NULL); return ceph_osd_request_cache ? 0 : -ENOMEM; } -- cgit v1.2.3 From 2c63f49a724a10bb71cc0fd34f8e5acce78525d5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Jan 2016 17:32:54 +0800 Subject: libceph: add helper that duplicates last extent operation This helper duplicates last extent operation in OSD request, then adjusts the new extent operation's offset and length. The helper is for scatterd page writeback, which adds nonconsecutive dirty pages to single OSD request. Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/osd_client.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index aada6a1383a4..4343df806710 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -266,6 +266,8 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); +extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc); extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccd4e031fa3f..32355d9d0103 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -534,6 +534,28 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_update); +void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc) +{ + struct ceph_osd_req_op *op, *prev_op; + + BUG_ON(which + 1 >= osd_req->r_num_ops); + + prev_op = &osd_req->r_ops[which]; + op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); + /* dup previous one */ + op->indata_len = prev_op->indata_len; + op->outdata_len = prev_op->outdata_len; + op->extent = prev_op->extent; + /* adjust offset */ + op->extent.offset += offset_inc; + op->extent.length -= offset_inc; + + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) + op->indata_len -= offset_inc; +} +EXPORT_SYMBOL(osd_req_op_extent_dup_last); + void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { -- cgit v1.2.3 From 89f081730c49a1d3b46359aa0054e6b3b80f47e4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Feb 2016 15:56:07 +0100 Subject: libceph: use sizeof_footer() more Don't open-code sizeof_footer() in read_partial_message() and ceph_msg_revoke(). Also, after switching to sizeof_footer(), it's now possible to use con_out_kvec_add() in prepare_write_message_footer(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/messenger.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9382619a405b..d9681bc839c7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1221,25 +1221,19 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) static void prepare_write_message_footer(struct ceph_connection *con) { struct ceph_msg *m = con->out_msg; - int v = con->out_kvec_left; m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec[v].iov_base = &m->footer; + con_out_kvec_add(con, sizeof_footer(con), &m->footer); if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) con->ops->sign_message(m); else m->footer.sig = 0; - con->out_kvec[v].iov_len = sizeof(m->footer); - con->out_kvec_bytes += sizeof(m->footer); } else { m->old_footer.flags = m->footer.flags; - con->out_kvec[v].iov_len = sizeof(m->old_footer); - con->out_kvec_bytes += sizeof(m->old_footer); } - con->out_kvec_left++; con->out_more = m->more_to_follow; con->out_msg_done = true; } @@ -2409,11 +2403,7 @@ static int read_partial_message(struct ceph_connection *con) } /* footer */ - if (need_sign) - size = sizeof(m->footer); - else - size = sizeof(m->old_footer); - + size = sizeof_footer(con); end += size; ret = read_partial(con, end, size, &m->footer); if (ret <= 0) @@ -3089,10 +3079,7 @@ void ceph_msg_revoke(struct ceph_msg *msg) con->out_skip += con_out_kvec_skip(con); } else { BUG_ON(!msg->data_length); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) - con->out_skip += sizeof(msg->footer); - else - con->out_skip += sizeof(msg->old_footer); + con->out_skip += sizeof_footer(con); } /* data, middle, front */ if (msg->data_length) -- cgit v1.2.3 From 5ee61e95b6b33c82f6fa1382585faed66aa01245 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 13 Mar 2016 15:18:39 +0800 Subject: libceph: use KMEM_CACHE macro Use KMEM_CACHE() instead of kmem_cache_create() to simplify the code. Signed-off-by: Geliang Tang Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d9681bc839c7..1831f6353622 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -235,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq; static int ceph_msgr_slab_init(void) { BUG_ON(ceph_msg_cache); - ceph_msg_cache = kmem_cache_create("ceph_msg", - sizeof (struct ceph_msg), - __alignof__(struct ceph_msg), 0, NULL); - + ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); if (!ceph_msg_cache) return -ENOMEM; BUG_ON(ceph_msg_data_cache); - ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", - sizeof (struct ceph_msg_data), - __alignof__(struct ceph_msg_data), - 0, NULL); + ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); if (ceph_msg_data_cache) return 0; -- cgit v1.2.3