diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-11 22:12:28 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-11 22:12:28 +0300 |
commit | 3bf7878f0f7d60c394f6d6631bb179e86f09f73c (patch) | |
tree | f998ef959865db1657baa410c4ed281ad9003183 /include | |
parent | 07d306c838c5c30196619baae36107d0615e459b (diff) | |
parent | 33e9c8dbfbcef8e4cda8e43a445e692ab7e0d8c0 (diff) | |
download | linux-3bf7878f0f7d60c394f6d6631bb179e86f09f73c.tar.xz |
Merge tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"The main item here is support for v12.y.z ("Luminous") clusters:
RESEND_ON_SPLIT, RADOS_BACKOFF, OSDMAP_PG_UPMAP and CRUSH_CHOOSE_ARGS
feature bits, and various other changes in the RADOS client protocol.
On top of that we have a new fsc mount option to allow supplying
fscache uniquifier (similar to NFS) and the usual pile of filesystem
fixes from Zheng"
* tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client: (44 commits)
libceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS
libceph: osd_state is 32 bits wide in luminous
crush: remove an obsolete comment
crush: crush_init_workspace starts with struct crush_work
libceph, crush: per-pool crush_choose_arg_map for crush_do_rule()
crush: implement weight and id overrides for straw2
libceph: apply_upmap()
libceph: compute actual pgid in ceph_pg_to_up_acting_osds()
libceph: pg_upmap[_items] infrastructure
libceph: ceph_decode_skip_* helpers
libceph: kill __{insert,lookup,remove}_pg_mapping()
libceph: introduce and switch to decode_pg_mapping()
libceph: don't pass pgid by value
libceph: respect RADOS_BACKOFF backoffs
libceph: make DEFINE_RB_* helpers more general
libceph: avoid unnecessary pi lookups in calc_target()
libceph: use target pi for calc_target() calculations
libceph: always populate t->target_{oid,oloc} in calc_target()
libceph: make sure need_resend targets reflect latest map
libceph: delete from need_resend_linger before check_linger_pool_dne()
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/ceph/ceph_features.h | 264 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 1 | ||||
-rw-r--r-- | include/linux/ceph/decode.h | 60 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 49 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h | 2 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 70 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 41 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 6 | ||||
-rw-r--r-- | include/linux/crush/crush.h | 66 | ||||
-rw-r--r-- | include/linux/crush/mapper.h | 9 |
10 files changed, 450 insertions, 118 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index fd8b2953c78f..f0f6c537b64c 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -2,103 +2,174 @@ #define __CEPH_FEATURES /* - * feature bits + * Each time we reclaim bits for reuse we need to specify another bit + * that, if present, indicates we have the new incarnation of that + * feature. Base case is 1 (first use). */ -#define CEPH_FEATURE_UID (1ULL<<0) -#define CEPH_FEATURE_NOSRCADDR (1ULL<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) -#define CEPH_FEATURE_FLOCK (1ULL<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) -#define CEPH_FEATURE_MONNAMES (1ULL<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) -#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) -#define CEPH_FEATURE_PGID64 (1ULL<<9) -#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) -#define CEPH_FEATURE_PGPOOL3 (1ULL<<11) -#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) -#define CEPH_FEATURE_OSDENC (1ULL<<13) -#define CEPH_FEATURE_OMAP (1ULL<<14) -#define CEPH_FEATURE_MONENC (1ULL<<15) -#define CEPH_FEATURE_QUERY_T (1ULL<<16) -#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) -#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) -#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) -#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) -#define CEPH_FEATURE_MON_GV (1ULL<<21) -#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) -#define CEPH_FEATURE_MSG_AUTH (1ULL<<23) -#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) -#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) -#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) -#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) -#define CEPH_FEATURE_MDSENC (1ULL<<29) -#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) -#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) -#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) -#define CEPH_FEATURE_MON_SCRUB (1ULL<<33) -#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) -#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) -#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ -#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) -#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) -#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */ -/* The process supports new-style OSDMap encoding. Monitors also use - this bit to determine if peers support NAK messages. */ -#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39) -#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40) -#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41) -#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */ -#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42) -#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43) -#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44) -#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45) -#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46) -#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */ -#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */ -#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */ -#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47) -#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */ -#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) -// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY -#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ -#define CEPH_FEATURE_MON_METADATA (1ULL<<50) -#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ -#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) -#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) -#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) -#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) -#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */ -#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ -#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ -#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ -// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 -#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ -#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */ +#define CEPH_FEATURE_INCARNATION_1 (0ull) +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL + +#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ + const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \ + const static uint64_t CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +/* this bit is ignored but still advertised by release *when* */ +#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \ + const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ + const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); /* - * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature - * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 - * to mean 33 bit ~0, and introduce a helper below to do the - * translation. + * this bit is ignored by release *unused* and not advertised by + * release *unadvertised* + */ +#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised) + + +/* + * test for a feature. this test is safer than a typical mask against + * the bit because it ensures that we have the bit AND the marker for the + * bit's incarnation. this must be used in any case where the features + * bits may include an old meaning of the bit. + */ +#define CEPH_HAVE_FEATURE(x, name) \ + (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name)) + + +/* + * Notes on deprecation: + * + * A *major* release is a release through which all upgrades must pass + * (e.g., jewel). For example, no pre-jewel server will ever talk to + * a post-jewel server (mon, osd, etc). + * + * For feature bits used *only* on the server-side: + * + * - In the first phase we indicate that a feature is DEPRECATED as of + * a particular release. This is the first major release X (say, + * jewel) that does not depend on its peers advertising the feature. + * That is, it safely assumes its peers all have the feature. We + * indicate this with the DEPRECATED macro. For example, + * + * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL) + * + * because 10.2.z (jewel) did not care if its peers advertised this + * feature bit. + * + * - In the second phase we stop advertising the the bit and call it + * RETIRED. This can normally be done in the *next* major release + * following the one in which we marked the feature DEPRECATED. In + * the above example, for 12.0.z (luminous) we can say: + * + * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) * - * This was introduced by ceph.git commit - * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 - * and fixed by ceph.git commit - * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c + * - The bit can be reused in the first post-luminous release, 13.0.z + * (m). + * + * This ensures that no two versions who have different meanings for + * the bit ever speak to each other. */ -#define CEPH_FEATURE_RESERVED (1ULL<<63) - -static inline u64 ceph_sanitize_features(u64 features) -{ - if (features & CEPH_FEATURE_RESERVED) { - /* everything through OSD_SNAPMAPPER */ - return 0x1ffffffffull; - } else { - return features; - } -} + +DEFINE_CEPH_FEATURE( 0, 1, UID) +DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) +DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE( 3, 1, FLOCK) +DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) +DEFINE_CEPH_FEATURE( 5, 1, MONNAMES) +DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) +DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) +DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR) +DEFINE_CEPH_FEATURE( 9, 1, PGID64) +DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP) +DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) +DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX) +DEFINE_CEPH_FEATURE(13, 1, OSDENC) +DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN) +DEFINE_CEPH_FEATURE(15, 1, MONENC) +DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) +DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) +DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap +DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap +DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap +DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) +DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS) + +DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) +DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID) +DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) +DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(28, 2, SERVER_M) +DEFINE_CEPH_FEATURE(29, 1, MDSENC) +DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) +DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me +DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) +DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) +DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) +DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES) +DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap +DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) +DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) +DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) +DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap +DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) +DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) +DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2) +DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS) +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap + +DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) +DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap + +DEFINE_CEPH_FEATURE(50, 1, MON_METADATA) +DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT) +DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES) +DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3) +DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT) +DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4) +DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) +DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) +DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap +DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap +DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap +DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) +DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap +DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap +DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) +DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap +DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap +DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit* + +DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down! +DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal +DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing + /* * Features supported. @@ -113,6 +184,11 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_SERVER_LUMINOUS | \ + CEPH_FEATURE_RESEND_ON_SPLIT | \ + CEPH_FEATURE_RADOS_BACKOFF | \ + CEPH_FEATURE_OSDMAP_PG_UPMAP | \ + CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \ CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ @@ -126,7 +202,11 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ + CEPH_FEATURE_OSD_POOLRESEND | \ CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_NEW_OSDOP_ENCODING | \ + CEPH_FEATURE_SERVER_JEWEL | \ + CEPH_FEATURE_MON_STATEFUL_SUB | \ CEPH_FEATURE_CRUSH_TUNABLES5 | \ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index ad078ebe25d6..edf5b04b918a 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -147,6 +147,7 @@ struct ceph_dir_layout { #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 #define CEPH_MSG_WATCH_NOTIFY 44 +#define CEPH_MSG_OSD_BACKOFF 61 /* watch-notify operations */ diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index f990f2cc907a..14af9b70d301 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -133,6 +133,66 @@ bad: } /* + * skip helpers + */ +#define ceph_decode_skip_n(p, end, n, bad) \ + do { \ + ceph_decode_need(p, end, n, bad); \ + *p += n; \ + } while (0) + +#define ceph_decode_skip_64(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u64), bad) + +#define ceph_decode_skip_32(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u32), bad) + +#define ceph_decode_skip_16(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u16), bad) + +#define ceph_decode_skip_8(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u8), bad) + +#define ceph_decode_skip_string(p, end, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + ceph_decode_skip_n(p, end, len, bad); \ + } while (0) + +#define ceph_decode_skip_set(p, end, type, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + while (len--) \ + ceph_decode_skip_##type(p, end, bad); \ + } while (0) + +#define ceph_decode_skip_map(p, end, ktype, vtype, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + while (len--) { \ + ceph_decode_skip_##ktype(p, end, bad); \ + ceph_decode_skip_##vtype(p, end, bad); \ + } \ + } while (0) + +#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + while (len--) { \ + ceph_decode_skip_##ktype1(p, end, bad); \ + ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \ + } \ + } while (0) + +/* * struct ceph_timespec <-> struct timespec */ static inline void ceph_decode_timespec(struct timespec *ts, diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 3229ae6c7846..8a79587e1317 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_SHIFT); } -/* - * These are not meant to be generic - an integer key is assumed. - */ -#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +#define RB_BYVAL(a) (a) +#define RB_BYPTR(a) (&(a)) +#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b)) + +#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ static void insert_##name(struct rb_root *root, type *t) \ { \ struct rb_node **n = &root->rb_node; \ @@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t) \ \ while (*n) { \ type *cur = rb_entry(*n, type, nodefld); \ + int cmp; \ \ parent = *n; \ - if (t->keyfld < cur->keyfld) \ + cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \ + if (cmp < 0) \ n = &(*n)->rb_left; \ - else if (t->keyfld > cur->keyfld) \ + else if (cmp > 0) \ n = &(*n)->rb_right; \ else \ BUG(); \ @@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t) \ RB_CLEAR_NODE(&t->nodefld); \ } -#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ -extern type __lookup_##name##_key; \ -static type *lookup_##name(struct rb_root *root, \ - typeof(__lookup_##name##_key.keyfld) key) \ +/* + * @lookup_param_type is a parameter and not constructed from (@type, + * @keyfld) with typeof() because adding const is too unwieldy. + */ +#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \ + lookup_param_type, nodefld) \ +static type *lookup_##name(struct rb_root *root, lookup_param_type key) \ { \ struct rb_node *n = root->rb_node; \ \ while (n) { \ type *cur = rb_entry(n, type, nodefld); \ + int cmp; \ \ - if (key < cur->keyfld) \ + cmp = cmpexp(key, keyexp(cur->keyfld)); \ + if (cmp < 0) \ n = n->rb_left; \ - else if (key > cur->keyfld) \ + else if (cmp > 0) \ n = n->rb_right; \ else \ return cur; \ @@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root, \ return NULL; \ } +#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \ + lookup_param_type, nodefld) \ +DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ +DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \ + lookup_param_type, nodefld) + +/* + * Shorthands for integer keys. + */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld) + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +extern type __lookup_##name##_key; \ +DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \ + typeof(__lookup_##name##_key.keyfld), nodefld) + #define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c5c4c713e00f..fbd94d9fa5dd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -44,6 +44,8 @@ struct ceph_connection_operations { struct ceph_msg_header *hdr, int *skip); + void (*reencode_message) (struct ceph_msg *msg); + int (*sign_message) (struct ceph_msg *msg); int (*check_message_signature) (struct ceph_msg *msg); }; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 85650b415e73..c6d96a5f46fd 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_OSD_CLIENT_H #define _FS_CEPH_OSD_CLIENT_H +#include <linux/bitrev.h> #include <linux/completion.h> #include <linux/kref.h> #include <linux/mempool.h> @@ -36,6 +37,8 @@ struct ceph_osd { struct ceph_connection o_con; struct rb_root o_requests; struct rb_root o_linger_requests; + struct rb_root o_backoff_mappings; + struct rb_root o_backoffs_by_id; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; @@ -136,7 +139,8 @@ struct ceph_osd_request_target { struct ceph_object_id target_oid; struct ceph_object_locator target_oloc; - struct ceph_pg pgid; + struct ceph_pg pgid; /* last raw pg we mapped to */ + struct ceph_spg spgid; /* last actual spg we mapped to */ u32 pg_num; u32 pg_num_mask; struct ceph_osds acting; @@ -148,6 +152,9 @@ struct ceph_osd_request_target { unsigned int flags; /* CEPH_OSD_FLAG_* */ bool paused; + u32 epoch; + u32 last_force_resend; + int osd; }; @@ -193,7 +200,6 @@ struct ceph_osd_request { unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ int r_attempts; - u32 r_last_force_resend; u32 r_map_dne_bound; struct ceph_osd_req_op r_ops[]; @@ -203,6 +209,23 @@ struct ceph_request_redirect { struct ceph_object_locator oloc; }; +/* + * osd request identifier + * + * caller name + incarnation# + tid to unique identify this request + */ +struct ceph_osd_reqid { + struct ceph_entity_name name; + __le64 tid; + __le32 inc; +} __packed; + +struct ceph_blkin_trace_info { + __le64 trace_id; + __le64 span_id; + __le64 parent_span_id; +} __packed; + typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, u64 notifier_id, void *data, size_t data_len); typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); @@ -221,7 +244,6 @@ struct ceph_osd_linger_request { struct list_head pending_lworks; struct ceph_osd_request_target t; - u32 last_force_resend; u32 map_dne_bound; struct timespec mtime; @@ -256,6 +278,48 @@ struct ceph_watch_item { struct ceph_entity_addr addr; }; +struct ceph_spg_mapping { + struct rb_node node; + struct ceph_spg spgid; + + struct rb_root backoffs; +}; + +struct ceph_hobject_id { + void *key; + size_t key_len; + void *oid; + size_t oid_len; + u64 snapid; + u32 hash; + u8 is_max; + void *nspace; + size_t nspace_len; + s64 pool; + + /* cache */ + u32 hash_reverse_bits; +}; + +static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid) +{ + hoid->hash_reverse_bits = bitrev32(hoid->hash); +} + +/* + * PG-wide backoff: [begin, end) + * per-object backoff: begin == end + */ +struct ceph_osd_backoff { + struct rb_node spg_node; + struct rb_node id_node; + + struct ceph_spg spgid; + u64 id; + struct ceph_hobject_id *begin; + struct ceph_hobject_id *end; +}; + #define CEPH_LINGER_ID_START 0xffff000000000000ULL struct ceph_osd_client { diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 938656f70807..a0996cb9faed 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -24,7 +24,15 @@ struct ceph_pg { uint32_t seed; }; +#define CEPH_SPG_NOSHARD -1 + +struct ceph_spg { + struct ceph_pg pgid; + s8 shard; +}; + int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); +int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ @@ -135,10 +143,14 @@ struct ceph_pg_mapping { struct { int len; int osds[]; - } pg_temp; + } pg_temp, pg_upmap; struct { int osd; } primary_temp; + struct { + int len; + int from_to[][2]; + } pg_upmap_items; }; }; @@ -150,13 +162,17 @@ struct ceph_osdmap { u32 flags; /* CEPH_OSDMAP_* */ u32 max_osd; /* size of osd_state, _offload, _addr arrays */ - u8 *osd_state; /* CEPH_OSD_* */ + u32 *osd_state; /* CEPH_OSD_* */ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ struct ceph_entity_addr *osd_addr; struct rb_root pg_temp; struct rb_root primary_temp; + /* remap (post-CRUSH, pre-up) */ + struct rb_root pg_upmap; /* PG := raw set */ + struct rb_root pg_upmap_items; /* from -> to within raw set */ + u32 *osd_primary_affinity; struct rb_root pg_pools; @@ -187,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) return !ceph_osd_is_up(map, osd); } -extern char *ceph_osdmap_state_str(char *str, int len, int state); +char *ceph_osdmap_state_str(char *str, int len, u32 state); extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, @@ -198,11 +214,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, return &map->osd_addr[osd]; } +#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4) + static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) { __u8 version; - if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { + if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) { pr_warn("incomplete pg encoding\n"); return -EINVAL; } @@ -240,6 +258,8 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, + u32 new_pg_num); bool ceph_is_new_interval(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, const struct ceph_osds *old_up, @@ -262,15 +282,24 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); +int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_id *oid, - struct ceph_object_locator *oloc, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid); void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting); +bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_spg *spgid); int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 5d0018782d50..385db08bb8b2 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -439,6 +439,12 @@ enum { const char *ceph_osd_watch_op_name(int o); +enum { + CEPH_OSD_BACKOFF_OP_BLOCK = 1, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, + CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, +}; + /* * an individual object operation. each may be accompanied by some data * payload diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index fbecbd089d75..92e165d417a6 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -2,6 +2,7 @@ #define CEPH_CRUSH_CRUSH_H #ifdef __KERNEL__ +# include <linux/rbtree.h> # include <linux/types.h> #else # include "crush_compat.h" @@ -137,6 +138,68 @@ struct crush_bucket { }; +/** @ingroup API + * + * Replacement weights for each item in a bucket. The size of the + * array must be exactly the size of the straw2 bucket, just as the + * item_weights array. + * + */ +struct crush_weight_set { + __u32 *weights; /*!< 16.16 fixed point weights + in the same order as items */ + __u32 size; /*!< size of the __weights__ array */ +}; + +/** @ingroup API + * + * Replacement weights and ids for a given straw2 bucket, for + * placement purposes. + * + * When crush_do_rule() chooses the Nth item from a straw2 bucket, the + * replacement weights found at __weight_set[N]__ are used instead of + * the weights from __item_weights__. If __N__ is greater than + * __weight_set_size__, the weights found at __weight_set_size-1__ are + * used instead. For instance if __weight_set__ is: + * + * [ [ 0x10000, 0x20000 ], // position 0 + * [ 0x20000, 0x40000 ] ] // position 1 + * + * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ] + * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ] + * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ] + * etc. + * + */ +struct crush_choose_arg { + __s32 *ids; /*!< values to use instead of items */ + __u32 ids_size; /*!< size of the __ids__ array */ + struct crush_weight_set *weight_set; /*!< weight replacements for + a given position */ + __u32 weight_set_size; /*!< size of the __weight_set__ array */ +}; + +/** @ingroup API + * + * Replacement weights and ids for each bucket in the crushmap. The + * __size__ of the __args__ array must be exactly the same as the + * __map->max_buckets__. + * + * The __crush_choose_arg__ at index N will be used when choosing + * an item from the bucket __map->buckets[N]__ bucket, provided it + * is a straw2 bucket. + * + */ +struct crush_choose_arg_map { +#ifdef __KERNEL__ + struct rb_node node; + u64 choose_args_index; +#endif + struct crush_choose_arg *args; /*!< replacement for each bucket + in the crushmap */ + __u32 size; /*!< size of the __args__ array */ +}; + struct crush_bucket_uniform { struct crush_bucket h; __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ @@ -236,6 +299,9 @@ struct crush_map { __u32 allowed_bucket_algs; __u32 *choose_tries; +#else + /* CrushWrapper::choose_args */ + struct rb_root choose_args; #endif }; diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index c95e19e1ff11..141edabb947e 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -11,11 +11,10 @@ #include "crush.h" extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); -extern int crush_do_rule(const struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - const __u32 *weights, int weight_max, - void *cwin); +int crush_do_rule(const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args); /* * Returns the exact amount of workspace that will need to be used |