From 7b25bf5f02c5c80adf96120e031dc3a1756ce54d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:26 +0200 Subject: libceph: encode CEPH_OSD_OP_FLAG_* op flags Encode ceph_osd_op::flags field so that it gets sent over the wire. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ceph/osd_client.c') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0676f2b199d6..5d7fd0b8c1c8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -688,7 +688,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; -- cgit v1.2.3 From c647b8a8c6366f849c2a237bfe525cb1d316d5f4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op This is primarily for rbd's benefit and is supposed to combat fragmentation: "... knowing that rbd images have a 4m size, librbd can pass a hint that will let the osd do the xfs allocation size ioctl on new files so that they are allocated in 1m or 4m chunks. We've seen cases where users with rbd workloads have very high levels of fragmentation in xfs and this would mitigate that and probably have a pretty nice performance benefit." SETALLOCHINT is considered advisory, so our backwards compatibility mechanism here is to set FAILOK flag for all SETALLOCHINT ops. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 8 ++++++++ include/linux/ceph/rados.h | 7 +++++++ net/ceph/osd_client.c | 27 +++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'net/ceph/osd_client.c') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index e94f5da251d6..c42d1ada685f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -103,6 +103,10 @@ struct ceph_osd_req_op { u32 timeout; __u8 flag; } watch; + struct { + u64 expected_object_size; + u64 expected_write_size; + } alloc_hint; }; }; @@ -294,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); +extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 8f9bf4570215..2caabef8d369 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -227,6 +227,9 @@ enum { CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, + /* hints */ + CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35, + /** multi **/ CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, @@ -416,6 +419,10 @@ struct ceph_osd_op { __le64 offset, length; __le64 src_offset; } __attribute__ ((packed)) clonerange; + struct { + __le64 expected_object_size; + __le64 expected_write_size; + } __attribute__ ((packed)) alloc_hint; }; __le32 payload_len; } __attribute__ ((packed)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5d7fd0b8c1c8..71830d79b0f4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) case CEPH_OSD_OP_OMAPCLEAR: case CEPH_OSD_OP_OMAPRMKEYS: case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_SETALLOCHINT: case CEPH_OSD_OP_CLONERANGE: case CEPH_OSD_OP_ASSERT_SRC_VERSION: case CEPH_OSD_OP_SRC_CMPXATTR: @@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. + */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); -- cgit v1.2.3 From a2505d63ee0541d9b4685250b033192e68222e97 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: split osdmap allocation and decode steps Split osdmap allocation and initialization into a separate function, ceph_osdmap_decode(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 2 +- net/ceph/osd_client.c | 2 +- net/ceph/osdmap.c | 44 +++++++++++++++++++++++++++++--------------- 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'net/ceph/osd_client.c') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8c8b3cefc28b..46c3e304c3d8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -156,7 +156,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } -extern struct ceph_osdmap *osdmap_decode(void **p, void *end); +extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map, struct ceph_messenger *msgr); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 71830d79b0f4..6f64eec18851 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2062,7 +2062,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4dd000d128fd..a82df6ea0749 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -684,9 +684,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) /* * decode a full map. */ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { - struct ceph_osdmap *map; u16 version; u32 len, max, i; int err = -EINVAL; @@ -694,14 +693,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) void *start = *p; struct ceph_pg_pool_info *pi; - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - - map->pg_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); ceph_decode_16_safe(p, end, version, bad); if (version > 6) { @@ -751,7 +743,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = osdmap_set_max_osd(map, max); if (err < 0) goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); /* osds */ err = -EINVAL; @@ -819,7 +810,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p = end; dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); - return map; + return 0; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", @@ -827,8 +818,31 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - ceph_osdmap_destroy(map); - return ERR_PTR(err); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; } /* @@ -872,7 +886,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end)); } /* new crush? */ -- cgit v1.2.3 From 8008ab1080c1768b02d232dcfd9e161cd47cc9f7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: return primary from ceph_calc_pg_acting() In preparation for adding support for primary_temp, stop assuming primaryness: add a primary out parameter to ceph_calc_pg_acting() and change call sites accordingly. Primary is now specified separately from the order of osds in the set. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 2 +- net/ceph/osd_client.c | 10 ++++------ net/ceph/osdmap.c | 20 ++++++++++++-------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'net/ceph/osd_client.c') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index b0c8f8490663..561ea896c657 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -212,7 +212,7 @@ extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds); + int *osds, int *primary); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f64eec18851..b4157dc22199 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1333,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc, { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; bool was_paused; @@ -1346,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f1cad21d1533..df9389ddd56c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1651,19 +1651,21 @@ static int apply_temps(struct ceph_osdmap *osdmap, /* * Calculate acting set for given pgid. * - * Return acting set length, or error. + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds) + int *osds, int *primary) { struct ceph_pg_pool_info *pool; u32 pps; int len; - int primary; pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return 0; + if (!pool) { + *primary = -1; + return -ENOENT; + } if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { /* hash pool id and seed so that pool PGs do not overlap */ @@ -1684,12 +1686,14 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) + if (len < 0) { + *primary = -1; return len; + } - len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + len = raw_to_up_osds(osdmap, pool, osds, len, primary); - len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + len = apply_temps(osdmap, pool, pgid, osds, len, primary); return len; } -- cgit v1.2.3