From 986e89898acb3d8f750f259a90cb73afca426b58 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 25 Jul 2017 14:40:03 +0200
Subject: libceph: make encode_request_*() work with r_mempool requests

Messages allocated out of ceph_msgpool have a fixed front length
(pool->front_len).  Asserting that the entire front has been filled
while encoding is thus wrong.

Fixes: 8cb441c0545d ("libceph: MOSDOp v8 encoding (actual spgid + full hash)")
Reported-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
---
 net/ceph/osd_client.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 901bb8221366..b5f016cb9569 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1918,10 +1918,12 @@ static void encode_request_partial(struct ceph_osd_request *req,
 	}
 
 	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
-	BUG_ON(p != end - 8); /* space for features */
+	BUG_ON(p > end - 8); /* space for features */
 
 	msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
 	/* front_len is finalized in encode_request_finish() */
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 	msg->hdr.data_len = cpu_to_le32(data_len);
 	/*
 	 * The header "data_off" is a hint to the receiver allowing it
@@ -1937,11 +1939,12 @@ static void encode_request_partial(struct ceph_osd_request *req,
 static void encode_request_finish(struct ceph_msg *msg)
 {
 	void *p = msg->front.iov_base;
+	void *const partial_end = p + msg->front.iov_len;
 	void *const end = p + msg->front_alloc_len;
 
 	if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
 		/* luminous OSD -- encode features and be done */
-		p = end - 8;
+		p = partial_end;
 		ceph_encode_64(&p, msg->con->peer_features);
 	} else {
 		struct {
@@ -1984,7 +1987,7 @@ static void encode_request_finish(struct ceph_msg *msg)
 		oid_len = p - oid;
 
 		tail = p;
-		tail_len = (end - p) - 8;
+		tail_len = partial_end - p;
 
 		p = msg->front.iov_base;
 		ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
-- 
cgit v1.2.3


From 4690faf00cf838392ce038202a85ac0d5f1df598 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 26 Jul 2017 09:59:15 +0200
Subject: libceph: don't call ->reencode_message() more than once per message

Reencoding an already reencoded message is a bad idea.  This could
happen on Policy::stateful_server connections (!CEPH_MSG_CONNECT_LOSSY),
such as MDS sessions.

This didn't pop up in testing because currently only OSD requests are
reencoded and OSD sessions are always lossy.

Fixes: 98ad5ebd1505 ("libceph: ceph_connection_operations::reencode_message() method")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
---
 net/ceph/messenger.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b7cc615d42ef..a67298c7e0cd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1287,10 +1287,10 @@ static void prepare_write_message(struct ceph_connection *con)
 	if (m->needs_out_seq) {
 		m->hdr.seq = cpu_to_le64(++con->out_seq);
 		m->needs_out_seq = false;
-	}
 
-	if (con->ops->reencode_message)
-		con->ops->reencode_message(m);
+		if (con->ops->reencode_message)
+			con->ops->reencode_message(m);
+	}
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
-- 
cgit v1.2.3


From e17e8969f5c59a10083af5e260bdad6026872203 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 24 Jul 2017 16:43:49 +0200
Subject: libceph: fallback for when there isn't a pool-specific choose_arg

There is now a fallback to a choose_arg index of -1 if there isn't
a pool-specific choose_arg set.  If you create a per-pool weight-set,
that works for that pool.  Otherwise we try the compat/default one.  If
that doesn't exist either, then we use the normal CRUSH weights.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/crush/crush.h |  2 +-
 net/ceph/osdmap.c           | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 92e165d417a6..07eed95e10c7 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -193,7 +193,7 @@ struct crush_choose_arg {
 struct crush_choose_arg_map {
 #ifdef __KERNEL__
 	struct rb_node node;
-	u64 choose_args_index;
+	s64 choose_args_index;
 #endif
 	struct crush_choose_arg *args; /*!< replacement for each bucket
                                             in the crushmap */
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 64ae9f89773a..eb57a06373ca 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2301,10 +2301,17 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
 	}
 }
 
+/*
+ * Index of the "default" fallback choose_args map, used if there is no
+ * crush_choose_arg_map for the choose_args_index passed to do_crush().
+ * If this also doesn't exist, fall back to canonical weights.
+ */
+#define CEPH_DEFAULT_CHOOSE_ARGS	-1
+
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    int *result, int result_max,
 		    const __u32 *weight, int weight_max,
-		    u64 choose_args_index)
+		    s64 choose_args_index)
 {
 	struct crush_choose_arg_map *arg_map;
 	int r;
@@ -2313,6 +2320,9 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
 	arg_map = lookup_choose_arg_map(&map->crush->choose_args,
 					choose_args_index);
+	if (!arg_map)
+		arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+						CEPH_DEFAULT_CHOOSE_ARGS);
 
 	mutex_lock(&map->crush_workspace_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-- 
cgit v1.2.3


From c7ed1a4bf4b446317eefa0f4916d94b1f6d3ada5 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 24 Jul 2017 15:49:52 +0200
Subject: crush: assume weight_set != null implies weight_set_size > 0

Reflects ceph.git commit 5e8fa3e06b68fae1582c9230a3a8d1abc6146286.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 net/ceph/crush/mapper.c | 2 +-
 net/ceph/osdmap.c       | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 746b145bfd11..417df675c71b 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,7 @@ static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
 				     const struct crush_choose_arg *arg,
 				     int position)
 {
-	if (!arg || !arg->weight_set || arg->weight_set_size == 0)
+	if (!arg || !arg->weight_set)
 		return bucket->item_weights;
 
 	if (position >= arg->weight_set_size)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index eb57a06373ca..2586e5546143 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -295,6 +295,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c)
 			ret = decode_choose_arg(p, end, arg);
 			if (ret)
 				goto fail;
+
+			if (arg->ids_size &&
+			    arg->ids_size != c->buckets[bucket_index]->size)
+				goto e_inval;
 		}
 
 		insert_choose_arg_map(&c->choose_args, arg_map);
-- 
cgit v1.2.3


From f53b7665c8cec40c8a638b55ee098b721e6be20c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 27 Jul 2017 15:16:39 +0200
Subject: libceph: upmap semantic changes

- apply both pg_upmap and pg_upmap_items
- allow bidirectional swap of pg-upmap-items

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 net/ceph/osdmap.c | 39 +++++++++++----------------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 2586e5546143..0bec71fa712e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2437,40 +2437,23 @@ static void apply_upmap(struct ceph_osdmap *osdmap,
 		for (i = 0; i < pg->pg_upmap.len; i++)
 			raw->osds[i] = pg->pg_upmap.osds[i];
 		raw->size = pg->pg_upmap.len;
-		return;
+		/* check and apply pg_upmap_items, if any */
 	}
 
 	pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
 	if (pg) {
-		/*
-		 * Note: this approach does not allow a bidirectional swap,
-		 * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
-		 */
-		for (i = 0; i < pg->pg_upmap_items.len; i++) {
-			int from = pg->pg_upmap_items.from_to[i][0];
-			int to = pg->pg_upmap_items.from_to[i][1];
-			int pos = -1;
-			bool exists = false;
-
-			/* make sure replacement doesn't already appear */
-			for (j = 0; j < raw->size; j++) {
-				int osd = raw->osds[j];
-
-				if (osd == to) {
-					exists = true;
+		for (i = 0; i < raw->size; i++) {
+			for (j = 0; j < pg->pg_upmap_items.len; j++) {
+				int from = pg->pg_upmap_items.from_to[j][0];
+				int to = pg->pg_upmap_items.from_to[j][1];
+
+				if (from == raw->osds[i]) {
+					if (!(to != CRUSH_ITEM_NONE &&
+					      to < osdmap->max_osd &&
+					      osdmap->osd_weight[to] == 0))
+						raw->osds[i] = to;
 					break;
 				}
-				/* ignore mapping if target is marked out */
-				if (osd == from && pos < 0 &&
-				    !(to != CRUSH_ITEM_NONE &&
-				      to < osdmap->max_osd &&
-				      osdmap->osd_weight[to] == 0)) {
-					pos = j;
-				}
-			}
-			if (!exists && pos >= 0) {
-				raw->osds[pos] = to;
-				return;
 			}
 		}
 	}
-- 
cgit v1.2.3


From ae78dd8139ce93a528beb7f3914531b7a7be9e30 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 27 Jul 2017 17:59:14 +0200
Subject: libceph: make RECOVERY_DELETES feature create a new interval

This is needed so that the OSDs can regenerate the missing set at the
start of a new interval where support for recovery deletes changed.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/osd_client.h | 1 +
 include/linux/ceph/osdmap.h     | 2 ++
 include/linux/ceph/rados.h      | 4 ++++
 net/ceph/osd_client.c           | 5 +++++
 net/ceph/osdmap.c               | 5 ++++-
 5 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index c6d96a5f46fd..adf670ecaf94 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -148,6 +148,7 @@ struct ceph_osd_request_target {
 	int size;
 	int min_size;
 	bool sort_bitwise;
+	bool recovery_deletes;
 
 	unsigned int flags;                /* CEPH_OSD_FLAG_* */
 	bool paused;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index a0996cb9faed..af3444a5bfdd 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -272,6 +272,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
 			  u32 new_pg_num,
 			  bool old_sort_bitwise,
 			  bool new_sort_bitwise,
+			  bool old_recovery_deletes,
+			  bool new_recovery_deletes,
 			  const struct ceph_pg *pgid);
 bool ceph_osds_changed(const struct ceph_osds *old_acting,
 		       const struct ceph_osds *new_acting,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 385db08bb8b2..b8281feda9c7 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -158,6 +158,10 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
 #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
 #define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL    (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN   (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require luminous for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
 
 /*
  * The error code to return when an OSD can't handle a write
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b5f016cb9569..dcfbdd74dfd1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1337,6 +1337,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	bool legacy_change;
 	bool split = false;
 	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+	bool recovery_deletes = ceph_osdmap_flag(osdc,
+						 CEPH_OSDMAP_RECOVERY_DELETES);
 	enum calc_target_result ct_res;
 	int ret;
 
@@ -1399,6 +1401,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 				 pi->pg_num,
 				 t->sort_bitwise,
 				 sort_bitwise,
+				 t->recovery_deletes,
+				 recovery_deletes,
 				 &last_pgid))
 		force_resend = true;
 
@@ -1421,6 +1425,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 		t->pg_num = pi->pg_num;
 		t->pg_num_mask = pi->pg_num_mask;
 		t->sort_bitwise = sort_bitwise;
+		t->recovery_deletes = recovery_deletes;
 
 		t->osd = acting.primary;
 	}
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 0bec71fa712e..f358d0bfa76b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2082,6 +2082,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
 			  u32 new_pg_num,
 			  bool old_sort_bitwise,
 			  bool new_sort_bitwise,
+			  bool old_recovery_deletes,
+			  bool new_recovery_deletes,
 			  const struct ceph_pg *pgid)
 {
 	return !osds_equal(old_acting, new_acting) ||
@@ -2089,7 +2091,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
 	       old_size != new_size ||
 	       old_min_size != new_min_size ||
 	       ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
-	       old_sort_bitwise != new_sort_bitwise;
+	       old_sort_bitwise != new_sort_bitwise ||
+	       old_recovery_deletes != new_recovery_deletes;
 }
 
 static int calc_pg_rank(int osd, const struct ceph_osds *acting)
-- 
cgit v1.2.3