From 986e89898acb3d8f750f259a90cb73afca426b58 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov <idryomov@gmail.com> Date: Tue, 25 Jul 2017 14:40:03 +0200 Subject: libceph: make encode_request_*() work with r_mempool requests Messages allocated out of ceph_msgpool have a fixed front length (pool->front_len). Asserting that the entire front has been filled while encoding is thus wrong. Fixes: 8cb441c0545d ("libceph: MOSDOp v8 encoding (actual spgid + full hash)") Reported-by: "Yan, Zheng" <zyan@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: "Yan, Zheng" <zyan@redhat.com> --- net/ceph/osd_client.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 901bb8221366..b5f016cb9569 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1918,10 +1918,12 @@ static void encode_request_partial(struct ceph_osd_request *req, } ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ - BUG_ON(p != end - 8); /* space for features */ + BUG_ON(p > end - 8); /* space for features */ msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */ /* front_len is finalized in encode_request_finish() */ + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.data_len = cpu_to_le32(data_len); /* * The header "data_off" is a hint to the receiver allowing it @@ -1937,11 +1939,12 @@ static void encode_request_partial(struct ceph_osd_request *req, static void encode_request_finish(struct ceph_msg *msg) { void *p = msg->front.iov_base; + void *const partial_end = p + msg->front.iov_len; void *const end = p + msg->front_alloc_len; if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) { /* luminous OSD -- encode features and be done */ - p = end - 8; + p = partial_end; ceph_encode_64(&p, msg->con->peer_features); } else { struct { @@ -1984,7 +1987,7 @@ static void encode_request_finish(struct ceph_msg *msg) oid_len = p - oid; tail = p; - tail_len = (end - p) - 8; + tail_len = partial_end - p; p = msg->front.iov_base; ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc)); -- cgit v1.2.3 From 4690faf00cf838392ce038202a85ac0d5f1df598 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov <idryomov@gmail.com> Date: Wed, 26 Jul 2017 09:59:15 +0200 Subject: libceph: don't call ->reencode_message() more than once per message Reencoding an already reencoded message is a bad idea. This could happen on Policy::stateful_server connections (!CEPH_MSG_CONNECT_LOSSY), such as MDS sessions. This didn't pop up in testing because currently only OSD requests are reencoded and OSD sessions are always lossy. Fixes: 98ad5ebd1505 ("libceph: ceph_connection_operations::reencode_message() method") Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: "Yan, Zheng" <zyan@redhat.com> --- net/ceph/messenger.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b7cc615d42ef..a67298c7e0cd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1287,10 +1287,10 @@ static void prepare_write_message(struct ceph_connection *con) if (m->needs_out_seq) { m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; - } - if (con->ops->reencode_message) - con->ops->reencode_message(m); + if (con->ops->reencode_message) + con->ops->reencode_message(m); + } dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), -- cgit v1.2.3 From e17e8969f5c59a10083af5e260bdad6026872203 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov <idryomov@gmail.com> Date: Mon, 24 Jul 2017 16:43:49 +0200 Subject: libceph: fallback for when there isn't a pool-specific choose_arg There is now a fallback to a choose_arg index of -1 if there isn't a pool-specific choose_arg set. If you create a per-pool weight-set, that works for that pool. Otherwise we try the compat/default one. If that doesn't exist either, then we use the normal CRUSH weights. Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Sage Weil <sage@redhat.com> --- include/linux/crush/crush.h | 2 +- net/ceph/osdmap.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 92e165d417a6..07eed95e10c7 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -193,7 +193,7 @@ struct crush_choose_arg { struct crush_choose_arg_map { #ifdef __KERNEL__ struct rb_node node; - u64 choose_args_index; + s64 choose_args_index; #endif struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 64ae9f89773a..eb57a06373ca 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2301,10 +2301,17 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, } } +/* + * Magic value used for a "default" fallback choose_args, used if the + * crush_choose_arg_map passed to do_crush() does not exist. If this + * also doesn't exist, fall back to canonical weights. + */ +#define CEPH_DEFAULT_CHOOSE_ARGS -1 + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - u64 choose_args_index) + s64 choose_args_index) { struct crush_choose_arg_map *arg_map; int r; @@ -2313,6 +2320,9 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, arg_map = lookup_choose_arg_map(&map->crush->choose_args, choose_args_index); + if (!arg_map) + arg_map = lookup_choose_arg_map(&map->crush->choose_args, + CEPH_DEFAULT_CHOOSE_ARGS); mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, -- cgit v1.2.3 From c7ed1a4bf4b446317eefa0f4916d94b1f6d3ada5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov <idryomov@gmail.com> Date: Mon, 24 Jul 2017 15:49:52 +0200 Subject: crush: assume weight_set != null imples weight_set_size > 0 Reflects ceph.git commit 5e8fa3e06b68fae1582c9230a3a8d1abc6146286. Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Sage Weil <sage@redhat.com> --- net/ceph/crush/mapper.c | 2 +- net/ceph/osdmap.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 746b145bfd11..417df675c71b 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -306,7 +306,7 @@ static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, const struct crush_choose_arg *arg, int position) { - if (!arg || !arg->weight_set || arg->weight_set_size == 0) + if (!arg || !arg->weight_set) return bucket->item_weights; if (position >= arg->weight_set_size) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index eb57a06373ca..2586e5546143 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -295,6 +295,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) ret = decode_choose_arg(p, end, arg); if (ret) goto fail; + + if (arg->ids_size && + arg->ids_size != c->buckets[bucket_index]->size) + goto e_inval; } insert_choose_arg_map(&c->choose_args, arg_map); -- cgit v1.2.3 From f53b7665c8cec40c8a638b55ee098b721e6be20c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov <idryomov@gmail.com> Date: Thu, 27 Jul 2017 15:16:39 +0200 Subject: libceph: upmap semantic changes - apply both pg_upmap and pg_upmap_items - allow bidirectional swap of pg-upmap-items Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Sage Weil <sage@redhat.com> --- net/ceph/osdmap.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2586e5546143..0bec71fa712e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2437,40 +2437,23 @@ static void apply_upmap(struct ceph_osdmap *osdmap, for (i = 0; i < pg->pg_upmap.len; i++) raw->osds[i] = pg->pg_upmap.osds[i]; raw->size = pg->pg_upmap.len; - return; + /* check and apply pg_upmap_items, if any */ } pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - /* - * Note: this approach does not allow a bidirectional swap, - * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. - */ - for (i = 0; i < pg->pg_upmap_items.len; i++) { - int from = pg->pg_upmap_items.from_to[i][0]; - int to = pg->pg_upmap_items.from_to[i][1]; - int pos = -1; - bool exists = false; - - /* make sure replacement doesn't already appear */ - for (j = 0; j < raw->size; j++) { - int osd = raw->osds[j]; - - if (osd == to) { - exists = true; + for (i = 0; i < raw->size; i++) { + for (j = 0; j < pg->pg_upmap_items.len; j++) { + int from = pg->pg_upmap_items.from_to[j][0]; + int to = pg->pg_upmap_items.from_to[j][1]; + + if (from == raw->osds[i]) { + if (!(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) + raw->osds[i] = to; break; } - /* ignore mapping if target is marked out */ - if (osd == from && pos < 0 && - !(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) { - pos = j; - } - } - if (!exists && pos >= 0) { - raw->osds[pos] = to; - return; } } } -- cgit v1.2.3 From ae78dd8139ce93a528beb7f3914531b7a7be9e30 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov <idryomov@gmail.com> Date: Thu, 27 Jul 2017 17:59:14 +0200 Subject: libceph: make RECOVERY_DELETES feature create a new interval This is needed so that the OSDs can regenerate the missing set at the start of a new interval where support for recovery deletes changed. Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Sage Weil <sage@redhat.com> --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/osdmap.h | 2 ++ include/linux/ceph/rados.h | 4 ++++ net/ceph/osd_client.c | 5 +++++ net/ceph/osdmap.c | 5 ++++- 5 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c6d96a5f46fd..adf670ecaf94 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -148,6 +148,7 @@ struct ceph_osd_request_target { int size; int min_size; bool sort_bitwise; + bool recovery_deletes; unsigned int flags; /* CEPH_OSD_FLAG_* */ bool paused; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index a0996cb9faed..af3444a5bfdd 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -272,6 +272,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid); bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 385db08bb8b2..b8281feda9c7 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -158,6 +158,10 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ #define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b5f016cb9569..dcfbdd74dfd1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1337,6 +1337,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool legacy_change; bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); + bool recovery_deletes = ceph_osdmap_flag(osdc, + CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; int ret; @@ -1399,6 +1401,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, pi->pg_num, t->sort_bitwise, sort_bitwise, + t->recovery_deletes, + recovery_deletes, &last_pgid)) force_resend = true; @@ -1421,6 +1425,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->pg_num = pi->pg_num; t->pg_num_mask = pi->pg_num_mask; t->sort_bitwise = sort_bitwise; + t->recovery_deletes = recovery_deletes; t->osd = acting.primary; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0bec71fa712e..f358d0bfa76b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2082,6 +2082,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid) { return !osds_equal(old_acting, new_acting) || @@ -2089,7 +2091,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, old_size != new_size || old_min_size != new_min_size || ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || - old_sort_bitwise != new_sort_bitwise; + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes; } static int calc_pg_rank(int osd, const struct ceph_osds *acting) -- cgit v1.2.3