diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-27 00:10:32 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-27 00:10:32 +0300 |
commit | a10c38a4f385f5d7c173a263ff6bb2d36021b3bb (patch) | |
tree | 3cbaa916940b36a9fdb27c8a231e1488fbc352d6 /include | |
parent | ea8ea737c46cffa5d0ee74309f81e55a7e5e9c2a (diff) | |
parent | e536030934aebf049fe6aaebc58dd37aeee21840 (diff) | |
download | linux-a10c38a4f385f5d7c173a263ff6bb2d36021b3bb.tar.xz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
"This changeset has a few main parts:
- Ilya has finished a huge refactoring effort to sync up the
client-side logic in libceph with the user-space client code, which
has evolved significantly over the last couple years, with lots of
additional behaviors (e.g., how requests are handled when cluster
is full and transitions from full to non-full).
This structure of the code is more closely aligned with userspace
now such that it will be much easier to maintain going forward when
behavior changes take place. There are some locking improvements
bundled in as well.
- Zheng adds multi-filesystem support (multiple namespaces within the
same Ceph cluster)
- Zheng has changed the readdir offsets and directory enumeration so
that dentry offsets are hash-based and therefore stable across
directory fragmentation events on the MDS.
- Zheng has a smorgasbord of bug fixes across fs/ceph"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
ceph: fix wake_up_session_cb()
ceph: don't use truncate_pagecache() to invalidate read cache
ceph: SetPageError() for writeback pages if writepages fails
ceph: handle interrupted ceph_writepage()
ceph: make ceph_update_writeable_page() uninterruptible
libceph: make ceph_osdc_wait_request() uninterruptible
ceph: handle -EAGAIN returned by ceph_update_writeable_page()
ceph: make fault/page_mkwrite return VM_FAULT_OOM for -ENOMEM
ceph: block non-fatal signals for fault/page_mkwrite
ceph: make logical calculation functions return bool
ceph: tolerate bad i_size for symlink inode
ceph: improve fragtree change detection
ceph: keep leaf frag when updating fragtree
ceph: fix dir_auth check in ceph_fill_dirfrag()
ceph: don't assume frag tree splits in mds reply are sorted
ceph: fix inode reference leak
ceph: using hash value to compose dentry offset
ceph: don't forbid marking directory complete after forward seek
ceph: record 'offset' for each entry of readdir result
ceph: define 'end/complete' in readdir reply as bit flags
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/ceph/ceph_frag.h | 4 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 20 | ||||
-rw-r--r-- | include/linux/ceph/decode.h | 2 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 57 | ||||
-rw-r--r-- | include/linux/ceph/mon_client.h | 23 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 231 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 158 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 34 |
8 files changed, 379 insertions, 150 deletions
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index b827e066e55a..146507df8650 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) return ceph_frag_make(newbits, ceph_frag_value(f) | (i << (24 - newbits))); } -static inline int ceph_frag_is_leftmost(__u32 f) +static inline bool ceph_frag_is_leftmost(__u32 f) { return ceph_frag_value(f) == 0; } -static inline int ceph_frag_is_rightmost(__u32 f) +static inline bool ceph_frag_is_rightmost(__u32 f) { return ceph_frag_value(f) == ceph_frag_mask(f); } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 37f28bf55ce4..dfce616002ad 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -153,8 +153,9 @@ struct ceph_dir_layout { /* watch-notify operations */ enum { - WATCH_NOTIFY = 1, /* notifying watcher */ - WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ }; @@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack { struct ceph_fsid fsid; } __attribute__ ((packed)); +#define CEPH_FS_CLUSTER_ID_NONE -1 + /* * mdsmap flags */ @@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_XATTR_REPLACE (1 << 1) #define CEPH_XATTR_REMOVE (1 << 31) +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ @@ -361,6 +376,7 @@ union ceph_mds_request_args { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; + __le16 flags; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index a6ef9cc267ec..19e9932f3e77 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) /* * bounds check input. */ -static inline int ceph_has_room(void **p, void *end, size_t n) +static inline bool ceph_has_room(void **p, void *end, size_t n) { return end >= *p && n <= end - *p; } diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index db92a8d4926e..690985daad1c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_SHIFT); } +/* + * These are not meant to be generic - an integer key is assumed. + */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +static void insert_##name(struct rb_root *root, type *t) \ +{ \ + struct rb_node **n = &root->rb_node; \ + struct rb_node *parent = NULL; \ + \ + BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ + \ + while (*n) { \ + type *cur = rb_entry(*n, type, nodefld); \ + \ + parent = *n; \ + if (t->keyfld < cur->keyfld) \ + n = &(*n)->rb_left; \ + else if (t->keyfld > cur->keyfld) \ + n = &(*n)->rb_right; \ + else \ + BUG(); \ + } \ + \ + rb_link_node(&t->nodefld, parent, n); \ + rb_insert_color(&t->nodefld, root); \ +} \ +static void erase_##name(struct rb_root *root, type *t) \ +{ \ + BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ + rb_erase(&t->nodefld, root); \ + RB_CLEAR_NODE(&t->nodefld); \ +} + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +static type *lookup_##name(struct rb_root *root, \ + typeof(((type *)0)->keyfld) key) \ +{ \ + struct rb_node *n = root->rb_node; \ + \ + while (n) { \ + type *cur = rb_entry(n, type, nodefld); \ + \ + if (key < cur->keyfld) \ + n = n->rb_left; \ + else if (key > cur->keyfld) \ + n = n->rb_right; \ + else \ + return cur; \ + } \ + \ + return NULL; \ +} + +#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) + extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e230e7ed60d3..e2a92df08b47 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -39,20 +39,31 @@ struct ceph_mon_request { ceph_monc_request_func_t do_request; }; +typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *); + /* * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ struct ceph_mon_generic_request { + struct ceph_mon_client *monc; struct kref kref; u64 tid; struct rb_node node; int result; - void *buf; + struct completion completion; + ceph_monc_callback_t complete_cb; + u64 private_data; /* r_tid/linger_id */ + struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ + + union { + struct ceph_statfs *st; + u64 newest; + } u; }; struct ceph_mon_client { @@ -77,7 +88,6 @@ struct ceph_mon_client { /* pending generic requests */ struct rb_root generic_request_tree; - int num_generic_requests; u64 last_tid; /* subs, indexed with CEPH_SUB_* */ @@ -86,6 +96,7 @@ struct ceph_mon_client { bool want; u32 have; /* epoch */ } subs[3]; + int fs_cluster_id; /* "mdsmap.<id>" sub */ #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; @@ -116,16 +127,18 @@ extern const char *ceph_sub_str[]; bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, bool continuous); void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); +void ceph_monc_renew_subs(struct ceph_mon_client *monc); -extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout); extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); -extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, - const char *what, u64 *newest); +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest); +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data); extern int ceph_monc_open_session(struct ceph_mon_client *monc); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cbf460927c42..19b14862d3e0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -20,10 +20,11 @@ struct ceph_osd_client; /* * completion callback for async writepages */ -typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, - struct ceph_msg *); +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -32,16 +33,15 @@ struct ceph_osd { int o_incarnation; struct rb_node o_node; struct ceph_connection o_con; - struct list_head o_requests; - struct list_head o_linger_requests; + struct rb_root o_requests; + struct rb_root o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; - int o_marked_for_keepalive; struct list_head o_keepalive_item; + struct mutex lock; }; - #define CEPH_OSD_SLAB_OPS 2 #define CEPH_OSD_MAX_OPS 16 @@ -104,76 +104,95 @@ struct ceph_osd_req_op { struct ceph_osd_data response_data; __u8 class_len; __u8 method_len; - __u8 argc; + u32 indata_len; } cls; struct { u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; + __u8 op; /* CEPH_OSD_WATCH_OP_ */ + u32 gen; } watch; struct { + struct ceph_osd_data request_data; + } notify_ack; + struct { + u64 cookie; + struct ceph_osd_data request_data; + struct ceph_osd_data response_data; + } notify; + struct { u64 expected_object_size; u64 expected_write_size; } alloc_hint; }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_req_lru_item; - struct list_head r_osd_item; - struct list_head r_linger_item; - struct list_head r_linger_osd_item; + struct rb_node r_mc_node; /* map check */ struct ceph_osd *r_osd; - struct ceph_pg r_pgid; - int r_pg_osds[CEPH_PG_MAX_SIZE]; - int r_num_pg_osds; + + struct ceph_osd_request_target r_t; +#define r_base_oid r_t.base_oid +#define r_base_oloc r_t.base_oloc +#define r_flags r_t.flags struct ceph_msg *r_request, *r_reply; - int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ /* request osd ops array */ unsigned int r_num_ops; - /* these are updated on each send */ - __le32 *r_request_osdmap_epoch; - __le32 *r_request_flags; - __le64 *r_request_pool; - void *r_request_pgid; - __le32 *r_request_attempts; - bool r_paused; - struct ceph_eversion *r_request_reassert_version; - int r_result; - int r_got_reply; - int r_linger; + bool r_got_reply; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion, r_safe_completion; + struct completion r_completion; + struct completion r_safe_completion; /* fsync waiter */ ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; - struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_base_oloc; - struct ceph_object_id r_base_oid; - struct ceph_object_locator r_target_oloc; - struct ceph_object_id r_target_oid; - - u64 r_snapid; - unsigned long r_stamp; /* send OR check time */ + /* set by submitter */ + u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ + struct ceph_snap_context *r_snapc; /* for writes */ + struct timespec r_mtime; /* ditto */ + u64 r_data_offset; /* ditto */ + bool r_linger; /* don't resend on failure */ - struct ceph_snap_context *r_snapc; /* snap context for writes */ + /* internal */ + unsigned long r_stamp; /* jiffies, send or check time */ + int r_attempts; + struct ceph_eversion r_replay_version; /* aka reassert_version */ + u32 r_last_force_resend; + u32 r_map_dne_bound; struct ceph_osd_req_op r_ops[]; }; @@ -182,44 +201,70 @@ struct ceph_request_redirect { struct ceph_object_locator oloc; }; -struct ceph_osd_event { - u64 cookie; - int one_shot; +typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len); +typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); + +struct ceph_osd_linger_request { struct ceph_osd_client *osdc; - void (*cb)(u64, u64, u8, void *); - void *data; - struct rb_node node; - struct list_head osd_node; + u64 linger_id; + bool committed; + bool is_watch; /* watch or notify */ + + struct ceph_osd *osd; + struct ceph_osd_request *reg_req; + struct ceph_osd_request *ping_req; + unsigned long ping_sent; + unsigned long watch_valid_thru; + struct list_head pending_lworks; + + struct ceph_osd_request_target t; + u32 last_force_resend; + u32 map_dne_bound; + + struct timespec mtime; + struct kref kref; -}; + struct mutex lock; + struct rb_node node; /* osd */ + struct rb_node osdc_node; /* osdc */ + struct rb_node mc_node; /* map check */ + struct list_head scan_item; + + struct completion reg_commit_wait; + struct completion notify_finish_wait; + int reg_commit_error; + int notify_finish_error; + int last_error; + + u32 register_gen; + u64 notify_id; + + rados_watchcb2_t wcb; + rados_watcherrcb_t errcb; + void *data; -struct ceph_osd_event_work { - struct work_struct work; - struct ceph_osd_event *event; - u64 ver; - u64 notify_id; - u8 opcode; + struct page ***preply_pages; + size_t *preply_len; }; struct ceph_osd_client { struct ceph_client *client; struct ceph_osdmap *osdmap; /* current map */ - struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; + struct rw_semaphore lock; - struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ - u64 last_tid; /* tid of last request */ - struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* in-flight lru */ - struct list_head req_unsent; /* unsent/need-resend queue */ - struct list_head req_notarget; /* map to no osd */ - struct list_head req_linger; /* lingering requests */ - int num_requests; + spinlock_t osd_lru_lock; + struct ceph_osd homeless_osd; + atomic64_t last_tid; /* tid of last request */ + u64 last_linger_id; + struct rb_root linger_requests; /* lingering requests */ + struct rb_root map_checks; + struct rb_root linger_map_checks; + atomic_t num_requests; + atomic_t num_homeless; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; #ifdef CONFIG_DEBUG_FS @@ -231,10 +276,6 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op_reply; - spinlock_t event_lock; - struct rb_root event_tree; - u64 event_count; - struct workqueue_struct *notify_wq; }; @@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, unsigned int which); -extern struct ceph_osd_data *osd_req_op_cls_response_data( - struct ceph_osd_request *osd_req, - unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, unsigned int which, @@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode); -extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag); extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, @@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); - -extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, - u64 snap_id, - struct timespec *mtime); +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, @@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, u32 truncate_seq, u64 truncate_size, bool use_mempool); -extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); - extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); @@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, extern void ceph_osdc_sync(struct ceph_osd_client *osdc); extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, @@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct timespec *mtime, struct page **pages, int nr_pages); -/* watch/notify events */ -extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent); -extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); -extern void ceph_osdc_put_event(struct ceph_osd_event *event); +/* watch/notify */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data); +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len); +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len); +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); #endif diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e55c08bc3a96..ddc426b22d81 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -24,21 +24,29 @@ struct ceph_pg { uint32_t seed; }; -#define CEPH_POOL_FLAG_HASHPSPOOL 1 +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); + +#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id + together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; s64 id; - u8 type; + u8 type; /* CEPH_POOL_TYPE_* */ u8 size; + u8 min_size; u8 crush_ruleset; u8 object_hash; + u32 last_force_request_resend; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; s64 read_tier; s64 write_tier; /* wins for read+write ops */ - u64 flags; + u64 flags; /* CEPH_POOL_FLAG_* */ char *name; + + bool was_full; /* for handle_one_map() */ }; static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) @@ -57,6 +65,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -64,11 +88,47 @@ struct ceph_object_locator { */ #define CEPH_MAX_OID_NAME_LEN 100 +/* + * 51-char inline_name is long enough for all cephfs and all but one + * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be + * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all + * other rbd requests fit into inline_name. + * + * Makes ceph_object_id 64 bytes on 64-bit. + */ +#define CEPH_OID_INLINE_LEN 52 + +/* + * Both inline and external buffers have space for a NUL-terminator, + * which is carried around. It's not required though - RADOS object + * names don't have to be NUL-terminated and may contain NULs. + */ struct ceph_object_id { - char name[CEPH_MAX_OID_NAME_LEN]; + char *name; + char inline_name[CEPH_OID_INLINE_LEN]; int name_len; }; +static inline void ceph_oid_init(struct ceph_object_id *oid) +{ + oid->name = oid->inline_name; + oid->name_len = 0; +} + +static inline bool ceph_oid_empty(const struct ceph_object_id *oid) +{ + return oid->name == oid->inline_name && !oid->name_len; +} + +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src); +__printf(2, 3) +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...); +__printf(3, 4) +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...); +void ceph_oid_destroy(struct ceph_object_id *oid); + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -87,7 +147,6 @@ struct ceph_pg_mapping { struct ceph_osdmap { struct ceph_fsid fsid; u32 epoch; - u32 mkfs_epoch; struct ceph_timespec created, modified; u32 flags; /* CEPH_OSDMAP_* */ @@ -113,43 +172,19 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline void ceph_oid_set_name(struct ceph_object_id *oid, - const char *name) -{ - int len; - - len = strlen(name); - if (len > sizeof(oid->name)) { - WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", - name, len, sizeof(oid->name)); - len = sizeof(oid->name); - } - - memcpy(oid->name, name, len); - oid->name_len = len; -} - -static inline void ceph_oid_copy(struct ceph_object_id *dest, - struct ceph_object_id *src) -{ - BUG_ON(src->name_len > sizeof(dest->name)); - memcpy(dest->name, src->name, src->name_len); - dest->name_len = src->name_len; -} - -static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && (map->osd_state[osd] & CEPH_OSD_EXISTS); } -static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return ceph_osd_exists(map, osd) && (map->osd_state[osd] & CEPH_OSD_UP); } -static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) { return !ceph_osd_is_up(map, osd); } @@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } +struct ceph_osdmap *ceph_osdmap_alloc(void); extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); +struct ceph_osds { + int osds[CEPH_PG_MAX_SIZE]; + int size; + int primary; /* id, NOT index */ +}; + +static inline void ceph_osds_init(struct ceph_osds *set) +{ + set->size = 0; + set->primary = -1; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); -/* calculate mapping of object to a placement group */ -extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out); - -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, - struct ceph_pg pgid, - int *osds, int *primary); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); + +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2f822dca1046..5c0da61cb763 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -114,8 +114,8 @@ struct ceph_object_layout { * compound epoch+version, used by storage layer to serialize mutations */ struct ceph_eversion { - __le32 epoch; __le64 version; + __le32 epoch; } __attribute__ ((packed)); /* @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write @@ -389,6 +394,13 @@ enum { CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ + CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ }; enum { @@ -415,7 +427,17 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; -#define RADOS_NOTIFY_VER 1 +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +const char *ceph_osd_watch_op_name(int o); /* * an individual object operation. each may be accompanied by some data @@ -450,10 +472,14 @@ struct ceph_osd_op { } __attribute__ ((packed)) snap; struct { __le64 cookie; - __le64 ver; - __u8 flag; /* 0 = unwatch, 1 = watch */ + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; struct { + __le64 cookie; + } __attribute__ ((packed)) notify; + struct { __le64 offset, length; __le64 src_offset; } __attribute__ ((packed)) clonerange; |