diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/ceph/ceph_features.h | 38 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 32 | ||||
-rw-r--r-- | include/linux/ceph/decode.h | 29 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 16 | ||||
-rw-r--r-- | include/linux/ceph/mdsmap.h | 4 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h | 2 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 74 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 30 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 158 | ||||
-rw-r--r-- | include/linux/crush/crush.h | 2 |
10 files changed, 227 insertions, 158 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index dad579b0c0e6..76554cecaab2 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -12,16 +12,46 @@ #define CEPH_FEATURE_MONNAMES (1<<5) #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) -/* bits 8-17 defined by user-space; not supported yet here */ +#define CEPH_FEATURE_OBJECTLOCATOR (1<<8) +#define CEPH_FEATURE_PGID64 (1<<9) +#define CEPH_FEATURE_INCSUBOSDMAP (1<<10) +#define CEPH_FEATURE_PGPOOL3 (1<<11) +#define CEPH_FEATURE_OSDREPLYMUX (1<<12) +#define CEPH_FEATURE_OSDENC (1<<13) +#define CEPH_FEATURE_OMAP (1<<14) +#define CEPH_FEATURE_MONENC (1<<15) +#define CEPH_FEATURE_QUERY_T (1<<16) +#define CEPH_FEATURE_INDEP_PG_MAP (1<<17) #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) +#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) +#define CEPH_FEATURE_MON_NULLROUTE (1<<20) +#define CEPH_FEATURE_MON_GV (1<<21) +#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) +#define CEPH_FEATURE_MSG_AUTH (1<<23) +#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) +#define CEPH_FEATURE_CREATEPOOLID (1<<26) +#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) +#define CEPH_FEATURE_OSD_HBMSGS (1<<28) +#define CEPH_FEATURE_MDSENC (1<<29) +#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) /* * Features supported. */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ - (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_CRUSH_TUNABLES) + (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_PGID64 | \ + CEPH_FEATURE_PGPOOL3 | \ + CEPH_FEATURE_OSDENC | \ + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ + CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_OSDHASHPSPOOL) #define CEPH_FEATURES_REQUIRED_DEFAULT \ - (CEPH_FEATURE_NOSRCADDR) + (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_PGID64 | \ + CEPH_FEATURE_PGPOOL3 | \ + CEPH_FEATURE_OSDENC) #endif diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index cf6f4d998a76..2ad7b860f062 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -21,16 +21,14 @@ * internal cluster protocols separately from the public, * client-facing protocol. */ -#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ -#define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */ #define CEPH_MONC_PROTOCOL 15 /* server/client */ -#define CEPH_INO_ROOT 1 -#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_ROOT 1 +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 @@ -51,7 +49,7 @@ struct ceph_file_layout { __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ /* object -> pg layout */ - __le32 fl_unused; /* unused; used to be preferred primary (-1) */ + __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); @@ -101,6 +99,8 @@ struct ceph_dir_layout { #define CEPH_MSG_MON_SUBSCRIBE_ACK 16 #define CEPH_MSG_AUTH 17 #define CEPH_MSG_AUTH_REPLY 18 +#define CEPH_MSG_MON_GET_VERSION 19 +#define CEPH_MSG_MON_GET_VERSION_REPLY 20 /* client <-> mds */ #define CEPH_MSG_MDS_MAP 21 @@ -221,6 +221,11 @@ struct ceph_mon_subscribe_ack { } __attribute__ ((packed)); /* + * mdsmap flags + */ +#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */ + +/* * mds states * > 0 -> in * <= 0 -> out @@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack { #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ #define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ +#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */ #define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ #define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed @@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s); #define CEPH_LOCK_IXATTR 2048 #define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ +#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */ /* client_session ops */ enum { @@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_SETATTR_SIZE 32 #define CEPH_SETATTR_CTIME 64 +/* + * Ceph setxattr request flags. + */ +#define CEPH_XATTR_CREATE 1 +#define CEPH_XATTR_REPLACE 2 + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ @@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ #define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ +#define CEPH_CAP_SIMPLE_BITS 2 +#define CEPH_CAP_FILE_BITS 8 + /* per-lock shift */ #define CEPH_CAP_SAUTH 2 #define CEPH_CAP_SLINK 4 #define CEPH_CAP_SXATTR 6 #define CEPH_CAP_SFILE 8 -#define CEPH_CAP_SFLOCK 20 +#define CEPH_CAP_SFLOCK 20 -#define CEPH_CAP_BITS 22 +#define CEPH_CAP_BITS 22 /* composed values */ #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 63d092822bad..360d9d08ca9e 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n) return end >= *p && n <= end - *p; } -#define ceph_decode_need(p, end, n, bad) \ - do { \ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_decode_need(p, end, n, bad) \ + do { \ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_decode_64_safe(p, end, v, bad) \ @@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n) * * There are two possible failures: * - converting the string would require accessing memory at or - * beyond the "end" pointer provided (-E - * - memory could not be allocated for the result + * beyond the "end" pointer provided (-ERANGE) + * - memory could not be allocated for the result (-ENOMEM) */ static inline char *ceph_extract_encoded_string(void **p, void *end, size_t *lenp, gfp_t gfp) @@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } -#define ceph_encode_need(p, end, n, bad) \ - do { \ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_encode_need(p, end, n, bad) \ + do { \ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_encode_64_safe(p, end, v, bad) \ @@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end, #define ceph_encode_32_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u32), bad); \ - ceph_encode_32(p, v); \ + ceph_encode_32(p, v); \ } while (0) #define ceph_encode_16_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u16), bad); \ - ceph_encode_16(p, v); \ + ceph_encode_16(p, v); \ + } while (0) +#define ceph_encode_8_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u8), bad); \ + ceph_encode_8(p, v); \ } while (0) #define ceph_encode_copy_safe(p, end, pv, n, bad) \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 084d3c622b12..29818fc3fa49 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len) } /* ceph_common.c */ +extern bool libceph_compatible(void *data); + extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern struct kmem_cache *ceph_inode_cachep; @@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client); /* pagevec.c */ extern void ceph_release_page_vector(struct page **pages, int num_pages); -extern struct page **ceph_get_direct_page_vector(const char __user *data, +extern struct page **ceph_get_direct_page_vector(const void __user *data, int num_pages, bool write_page); extern void ceph_put_page_vector(struct page **pages, int num_pages, @@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages, extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_copy_user_to_page_vector(struct page **pages, - const char __user *data, + const void __user *data, loff_t off, size_t len); -extern int ceph_copy_to_page_vector(struct page **pages, - const char *data, +extern void ceph_copy_to_page_vector(struct page **pages, + const void *data, loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, - char *data, +extern void ceph_copy_from_page_vector(struct page **pages, + void *data, loff_t off, size_t len); -extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, +extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data, loff_t off, size_t len); extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index cb15b5d867c7..87ed09f54800 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -29,8 +29,8 @@ struct ceph_mdsmap { /* which object pools file data can be stored in */ int m_num_data_pg_pools; - u32 *m_data_pg_pools; - u32 m_cas_pg_pool; + u64 *m_data_pg_pools; + u64 m_cas_pg_pool; }; static inline struct ceph_entity_addr * diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 14ba5ee738a9..60903e0f665c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -83,9 +83,11 @@ struct ceph_msg { struct list_head list_head; struct kref kref; +#ifdef CONFIG_BLOCK struct bio *bio; /* instead of pages/pagelist */ struct bio *bio_iter; /* bio iterator */ int bio_seg; /* current bio segment */ +#endif /* CONFIG_BLOCK */ struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; bool more_to_follow; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d9b880e977e6..1dd5d466b6f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -10,6 +10,7 @@ #include <linux/ceph/osdmap.h> #include <linux/ceph/messenger.h> #include <linux/ceph/auth.h> +#include <linux/ceph/pagelist.h> /* * Maximum object name size @@ -22,7 +23,6 @@ struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; struct ceph_authorizer; -struct ceph_pagelist; /* * completion callback for async writepages @@ -47,6 +47,9 @@ struct ceph_osd { struct list_head o_keepalive_item; }; + +#define CEPH_OSD_MAX_OP 10 + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -63,9 +66,23 @@ struct ceph_osd_request { struct ceph_connection *r_con_filling_msg; struct ceph_msg *r_request, *r_reply; - int r_result; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ + int r_num_ops; + + /* encoded message content */ + struct ceph_osd_op *r_request_ops; + /* these are updated on each send */ + __le32 *r_request_osdmap_epoch; + __le32 *r_request_flags; + __le64 *r_request_pool; + void *r_request_pgid; + __le32 *r_request_attempts; + struct ceph_eversion *r_request_reassert_version; + + int r_result; + int r_reply_op_len[CEPH_OSD_MAX_OP]; + s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; @@ -82,6 +99,7 @@ struct ceph_osd_request { char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ int r_oid_len; + u64 r_snapid; unsigned long r_stamp; /* send OR check time */ struct ceph_file_layout r_file_layout; @@ -95,7 +113,7 @@ struct ceph_osd_request { struct bio *r_bio; /* instead of pages */ #endif - struct ceph_pagelist *r_trail; /* trailing part of the data */ + struct ceph_pagelist r_trail; /* trailing part of the data */ }; struct ceph_osd_event { @@ -107,7 +125,6 @@ struct ceph_osd_event { struct rb_node node; struct list_head osd_node; struct kref kref; - struct completion completion; }; struct ceph_osd_event_work { @@ -157,7 +174,7 @@ struct ceph_osd_client { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ - u32 flags; /* CEPH_OSD_FLAG_* */ + u32 payload_len; union { struct { u64 offset, length; @@ -166,23 +183,24 @@ struct ceph_osd_req_op { } extent; struct { const char *name; - u32 name_len; const char *val; + u32 name_len; u32 value_len; __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ } xattr; struct { const char *class_name; - __u8 class_len; const char *method_name; - __u8 method_len; - __u8 argc; const char *indata; u32 indata_len; + __u8 class_len; + __u8 method_len; + __u8 argc; } cls; struct { - u64 cookie, count; + u64 cookie; + u64 count; } pgls; struct { u64 snapid; @@ -190,12 +208,11 @@ struct ceph_osd_req_op { struct { u64 cookie; u64 ver; - __u8 flag; u32 prot_ver; u32 timeout; + __u8 flag; } watch; }; - u32 payload_len; }; extern int ceph_osdc_init(struct ceph_osd_client *osdc, @@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, - u64 snapid, - u64 off, u64 *plen, u64 *bno, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op); - extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio); + gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, + unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len); + u64 snap_id, + struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, @@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, int do_sync, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, int num_reply, - int page_align); + bool use_mempool, int page_align); extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req); @@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - struct page **pages, int nr_pages, - int flags, int do_sync, bool nofail); + struct page **pages, int nr_pages); /* watch/notify events */ extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, void (*event_cb)(u64, u64, u8, void *), - int one_shot, void *data, - struct ceph_osd_event **pevent); + void *data, struct ceph_osd_event **pevent); extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); -extern int ceph_osdc_wait_event(struct ceph_osd_event *event, - unsigned long timeout); extern void ceph_osdc_put_event(struct ceph_osd_event *event); #endif diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 10a417f9f76f..c819190d1642 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -18,14 +18,31 @@ * The map can be updated either via an incremental map (diff) describing * the change between two successive epochs, or as a fully encoded map. */ +struct ceph_pg { + uint64_t pool; + uint32_t seed; +}; + +#define CEPH_POOL_FLAG_HASHPSPOOL 1 + struct ceph_pg_pool_info { struct rb_node node; - int id; - struct ceph_pg_pool v; - int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; + s64 id; + u8 type; + u8 size; + u8 crush_ruleset; + u8 object_hash; + u32 pg_num, pgp_num; + int pg_num_mask, pgp_num_mask; + u64 flags; char *name; }; +struct ceph_object_locator { + uint64_t pool; + char *key; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_object_layout(struct ceph_object_layout *ol, +extern int ceph_calc_object_layout(struct ceph_pg *pg, const char *oid, struct ceph_file_layout *fl, struct ceph_osdmap *osdmap); -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, +extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, + struct ceph_pg pgid, int *acting); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2c04afeead1c..68c96a508ac2 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -9,14 +9,6 @@ #include <linux/ceph/msgr.h> /* - * osdmap encoding versions - */ -#define CEPH_OSDMAP_INC_VERSION 5 -#define CEPH_OSDMAP_INC_VERSION_EXT 6 -#define CEPH_OSDMAP_VERSION 5 -#define CEPH_OSDMAP_VERSION_EXT 6 - -/* * fs id */ struct ceph_fsid { @@ -64,7 +56,7 @@ struct ceph_timespec { * placement group. * we encode this into one __le64. */ -struct ceph_pg { +struct ceph_pg_v1 { __le16 preferred; /* preferred primary osd */ __le16 ps; /* placement seed */ __le32 pool; /* object pool */ @@ -91,21 +83,6 @@ struct ceph_pg { #define CEPH_PG_TYPE_REP 1 #define CEPH_PG_TYPE_RAID4 2 -#define CEPH_PG_POOL_VERSION 2 -struct ceph_pg_pool { - __u8 type; /* CEPH_PG_TYPE_* */ - __u8 size; /* number of osds in each pg */ - __u8 crush_ruleset; /* crush placement rule */ - __u8 object_hash; /* hash mapping object name to ps */ - __le32 pg_num, pgp_num; /* number of pg's */ - __le32 lpg_num, lpgp_num; /* number of localized pg's */ - __le32 last_change; /* most recent epoch changed */ - __le64 snap_seq; /* seq for per-pool snapshot */ - __le32 snap_epoch; /* epoch of last snap */ - __le32 num_snaps; - __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ - __le64 auid; /* who owns the pg */ -} __attribute__ ((packed)); /* * stable_mod func is used to control number of placement groups. @@ -128,7 +105,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask) * object layout - how a given object should be stored. */ struct ceph_object_layout { - struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ + struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */ __le32 ol_stripe_unit; /* for per-object parity, if any */ } __attribute__ ((packed)); @@ -145,8 +122,12 @@ struct ceph_eversion { */ /* status bits */ -#define CEPH_OSD_EXISTS 1 -#define CEPH_OSD_UP 2 +#define CEPH_OSD_EXISTS (1<<0) +#define CEPH_OSD_UP (1<<1) +#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */ +#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ + +extern const char *ceph_osd_state_name(int s); /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ #define CEPH_OSD_IN 0x10000 @@ -161,9 +142,25 @@ struct ceph_eversion { #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ +#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ +#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ +#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ +#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ +#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ +#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ + +/* + * The error code to return when an OSD can't handle a write + * because it is too large. + */ +#define OSD_WRITETOOBIG EMSGSIZE /* * osd ops + * + * WARNING: do not use these op codes directly. Use the helpers + * defined below instead. In certain cases, op code behavior was + * redefined, resulting in special-cases in the helpers. */ #define CEPH_OSD_OP_MODE 0xf000 #define CEPH_OSD_OP_MODE_RD 0x1000 @@ -177,6 +174,7 @@ struct ceph_eversion { #define CEPH_OSD_OP_TYPE_ATTR 0x0300 #define CEPH_OSD_OP_TYPE_EXEC 0x0400 #define CEPH_OSD_OP_TYPE_PG 0x0500 +#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */ enum { /** data **/ @@ -217,6 +215,23 @@ enum { CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, + /* omap */ + CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17, + CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18, + CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19, + CEPH_OSD_OP_OMAPGETVALSBYKEYS = + CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20, + CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21, + CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22, + CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23, + CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, + CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, + + /** multi **/ + CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, + CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, + CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3, + /** attrs **/ /* read */ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, @@ -238,6 +253,7 @@ enum { CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, + CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9, /** lock **/ CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, @@ -248,10 +264,12 @@ enum { CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, /** exec **/ + /* note: the RD bit here is wrong; see special-case below in helper */ CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, /** pg **/ CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, + CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2, }; static inline int ceph_osd_op_type_lock(int op) @@ -274,6 +292,10 @@ static inline int ceph_osd_op_type_pg(int op) { return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; } +static inline int ceph_osd_op_type_multi(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI; +} static inline int ceph_osd_op_mode_subop(int op) { @@ -281,11 +303,12 @@ static inline int ceph_osd_op_mode_subop(int op) } static inline int ceph_osd_op_mode_read(int op) { - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; + return (op & CEPH_OSD_OP_MODE_RD) && + op != CEPH_OSD_OP_CALL; } static inline int ceph_osd_op_mode_modify(int op) { - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; + return op & CEPH_OSD_OP_MODE_WR; } /* @@ -294,34 +317,38 @@ static inline int ceph_osd_op_mode_modify(int op) */ #define CEPH_OSD_TMAP_HDR 'h' #define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' /* create key */ #define CEPH_OSD_TMAP_RM 'r' +#define CEPH_OSD_TMAP_RMSLOPPY 'R' extern const char *ceph_osd_op_name(int op); - /* * osd op flags * * An op may be READ, WRITE, or READ|WRITE. */ enum { - CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ - CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ - CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ - CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ - CEPH_OSD_FLAG_READ = 16, /* op may read */ - CEPH_OSD_FLAG_WRITE = 32, /* op may write */ - CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ - CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ - CEPH_OSD_FLAG_BALANCE_READS = 256, - CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ - CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ - CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ - CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ + CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */ + CEPH_OSD_FLAG_READ = 0x0010, /* op may read */ + CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */ + CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 0x0100, + CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */ + CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ + CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ + CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ }; enum { CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ + CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ }; #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ @@ -381,48 +408,13 @@ struct ceph_osd_op { __le64 ver; __u8 flag; /* 0 = unwatch, 1 = watch */ } __attribute__ ((packed)) watch; -}; + struct { + __le64 offset, length; + __le64 src_offset; + } __attribute__ ((packed)) clonerange; + }; __le32 payload_len; } __attribute__ ((packed)); -/* - * osd request message header. each request may include multiple - * ceph_osd_op object operations. - */ -struct ceph_osd_request_head { - __le32 client_inc; /* client incarnation */ - struct ceph_object_layout layout; /* pgid */ - __le32 osdmap_epoch; /* client's osdmap epoch */ - - __le32 flags; - - struct ceph_timespec mtime; /* for mutations only */ - struct ceph_eversion reassert_version; /* if we are replaying op */ - - __le32 object_len; /* length of object name */ - - __le64 snapid; /* snapid to read */ - __le64 snap_seq; /* writer's snap context */ - __le32 num_snaps; - - __le16 num_ops; - struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ -} __attribute__ ((packed)); - -struct ceph_osd_reply_head { - __le32 client_inc; /* client incarnation */ - __le32 flags; - struct ceph_object_layout layout; - __le32 osdmap_epoch; - struct ceph_eversion reassert_version; /* for replaying uncommitted */ - - __le32 result; /* result code */ - - __le32 object_len; /* length of object name */ - __le32 num_ops; - struct ceph_osd_op ops[0]; /* ops[], object */ -} __attribute__ ((packed)); - - #endif diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 25baa287cff7..6a1101f24cfb 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -162,6 +162,8 @@ struct crush_map { __u32 choose_local_fallback_tries; /* choose attempts before giving up */ __u32 choose_total_tries; + /* attempt chooseleaf inner descent once; on failure retry outer descent */ + __u32 chooseleaf_descend_once; }; |