From 9d521470a40f16110bd31018034155c60c1a1275 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 31 Jan 2014 17:54:26 +0200 Subject: libceph: a per-osdc crush scratch buffer With the addition of erasure coding support in the future, scratch variable-length array in crush_do_rule_ary() is going to grow to at least 200 bytes on average, on top of another 128 bytes consumed by rawosd/osd arrays in the call chain. Replace it with a buffer inside struct osdmap and a mutex. This shouldn't result in any contention, because all osd requests were already serialized by request_mutex at that point; the only unlocked caller was ceph_ioctl_get_dataloc(). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 49ff69f0746b..8c8b3cefc28b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -84,6 +84,9 @@ struct ceph_osdmap { /* the CRUSH map specifies the mapping of placement groups to * the list of osds that store+replicate them. */ struct crush_map *crush; + + struct mutex crush_scratch_mutex; + int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; static inline void ceph_oid_set_name(struct ceph_object_id *oid, -- cgit v1.2.3 From 7b25bf5f02c5c80adf96120e031dc3a1756ce54d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:26 +0200 Subject: libceph: encode CEPH_OSD_OP_FLAG_* op flags Encode ceph_osd_op::flags field so that it gets sent over the wire. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/rados.h | 2 +- net/ceph/osd_client.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fd47e872ebcc..e94f5da251d6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -76,6 +76,7 @@ struct ceph_osd_data { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ + u32 flags; /* CEPH_OSD_OP_FLAG_* */ u32 payload_len; union { struct ceph_osd_data raw_data_in; diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 96292df4041b..8f9bf4570215 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -382,7 +382,7 @@ enum { */ struct ceph_osd_op { __le16 op; /* CEPH_OSD_OP_* */ - __le32 flags; /* CEPH_OSD_FLAG_* */ + __le32 flags; /* CEPH_OSD_OP_FLAG_* */ union { struct { __le64 offset, length; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0676f2b199d6..5d7fd0b8c1c8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -688,7 +688,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; -- cgit v1.2.3 From c647b8a8c6366f849c2a237bfe525cb1d316d5f4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op This is primarily for rbd's benefit and is supposed to combat fragmentation: "... knowing that rbd images have a 4m size, librbd can pass a hint that will let the osd do the xfs allocation size ioctl on new files so that they are allocated in 1m or 4m chunks. 
We've seen cases where users with rbd workloads have very high levels of fragmentation in xfs and this would mitigate that and probably have a pretty nice performance benefit." SETALLOCHINT is considered advisory, so our backwards compatibility mechanism here is to set FAILOK flag for all SETALLOCHINT ops. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 8 ++++++++ include/linux/ceph/rados.h | 7 +++++++ net/ceph/osd_client.c | 27 +++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index e94f5da251d6..c42d1ada685f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -103,6 +103,10 @@ struct ceph_osd_req_op { u32 timeout; __u8 flag; } watch; + struct { + u64 expected_object_size; + u64 expected_write_size; + } alloc_hint; }; }; @@ -294,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); +extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 8f9bf4570215..2caabef8d369 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -227,6 +227,9 @@ enum { CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, + /* hints */ + CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35, + /** multi **/ CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, @@ -416,6 +419,10 @@ struct ceph_osd_op { __le64 offset, length; __le64 src_offset; } __attribute__ ((packed)) clonerange; + struct { + __le64 expected_object_size; + __le64 expected_write_size; + } __attribute__ ((packed)) alloc_hint; }; __le32 payload_len; } __attribute__ ((packed)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5d7fd0b8c1c8..71830d79b0f4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) case CEPH_OSD_OP_OMAPCLEAR: case CEPH_OSD_OP_OMAPRMKEYS: case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_SETALLOCHINT: case CEPH_OSD_OP_CLONERANGE: case CEPH_OSD_OP_ASSERT_SRC_VERSION: case CEPH_OSD_OP_SRC_CMPXATTR: @@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. 
+ */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); -- cgit v1.2.3 From 7cc69d42e6950404587bef9489a5ed6f9f6bab4e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: bump CEPH_OSD_MAX_OP to 3 Our longest osd request now contains 3 ops: copyup+hint+write. Also, CEPH_OSD_MAX_OP value in a BUG_ON in rbd_osd_req_callback() was hard-coded to 2. Fix it, and switch to rbd_assert while at it. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b55b7812cf93..4c612c4041b6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1654,7 +1654,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - BUG_ON(osd_req->r_num_ops > 2); + rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); /* * We support a 64-bit length, but ultimately it has to be diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c42d1ada685f..94ec69672164 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -43,7 +43,7 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 2 +#define CEPH_OSD_MAX_OP 3 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE = 0, -- cgit v1.2.3 From 19913b4eac4a230dccb548931358398f45dabe4c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 6 Mar 2014 16:40:32 +0800 Subject: ceph: add get_name() NFS export callback Use the newly introduced LOOKUPNAME MDS request to connect child inode to its parent directory. 
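To situate the new callback: the exportfs layer in the VFS invokes ->get_name() while decoding an NFS file handle, when it must reconnect a disconnected dentry. It asks the filesystem for the child's name under a known parent, then repeats the lookup by name. A rough, simplified sketch of that caller, modeled on fs/exportfs/expfs.c (names and error handling approximate, not verbatim kernel code):

    /* illustrative only: how exportfs consumes ->get_name() */
    static struct dentry *reconnect_one(struct vfsmount *mnt,
                                        struct dentry *dentry, char *nbuf)
    {
            struct dentry *parent = dget_parent(dentry);
            struct dentry *tmp;
            int err;

            err = exportfs_get_name(mnt, parent, nbuf, dentry);
            if (err) {
                    dput(parent);
                    return ERR_PTR(err);
            }
            tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
            /* ... confirm tmp refers to the same inode as dentry ... */
            dput(parent);
            return tmp;
    }

ceph_get_name() below fills nbuf for this path by doing a LOOKUPNAME round trip to the MDS.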
Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/export.c | 40 ++++++++++++++++++++++++++++++++++ fs/ceph/inode.c | 51 +++++++++++++++++++++++++++++++++++++++++++- fs/ceph/strings.c | 1 + include/linux/ceph/ceph_fs.h | 1 + 4 files changed, 92 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index eb66408ff236..00d6af6a32ec 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -202,9 +202,49 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return dentry; } +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; + + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&parent->d_inode->i_mutex); + + req->r_inode = child->d_inode; + ihold(child->d_inode); + req->r_ino2 = ceph_vino(parent->d_inode); + req->r_locked_dir = parent->d_inode; + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&parent->d_inode->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(child->d_inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(child->d_inode), err); + } + + ceph_mdsc_put_request(req); + return err; +} + const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8bf2384bd423..91d6c9d49e3e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1044,10 +1044,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) - return err; + goto done; } else { WARN_ON_ONCE(1); } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } } if (rinfo->head->is_target) { diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 4440f447fd3f..51cc23e48111 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPHASH: 
return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 25bfb0eff772..35f345f7b3a3 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -332,6 +332,7 @@ enum { CEPH_MDS_OP_LOOKUPHASH = 0x00102, CEPH_MDS_OP_LOOKUPPARENT = 0x00103, CEPH_MDS_OP_LOOKUPINO = 0x00104, + CEPH_MDS_OP_LOOKUPNAME = 0x00105, CEPH_MDS_OP_SETXATTR = 0x01105, CEPH_MDS_OP_RMXATTR = 0x01106, -- cgit v1.2.3 From eb13e832f823f6c110ea53e3067bafe22b87de63 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 9 Mar 2014 23:16:40 +0800 Subject: ceph: use fl->fl_file as owner identifier of flock and posix lock flock and posix lock should use fl->fl_file instead of process ID as owner identifier. (posix lock uses fl->fl_owner. fl->fl_owner is usually equal to fl->fl_file, but it also can be a customized value). The process ID of who holds the lock is just for F_GETLK fcntl(2). The fix is rename the 'pid' fields of struct ceph_mds_request_args and struct ceph_filelock to 'owner', rename 'pid_namespace' fields to 'pid'. Assign fl->fl_file to the 'owner' field of lock messages. We also set the most significant bit of the 'owner' field. MDS can use that bit to distinguish between old and new clients. The MDS counterpart of this patch modifies the flock code to not take the 'pid_namespace' into consideration when checking conflict locks. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/locks.c | 61 +++++++++++++++++++++++++++++--------------- fs/ceph/super.c | 1 + fs/ceph/super.h | 1 + include/linux/ceph/ceph_fs.h | 4 +-- 4 files changed, 45 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index f91a569a20fb..d94ba0df9f4d 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -2,11 +2,31 @@ #include #include +#include #include "super.h" #include "mds_client.h" #include +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + /** * Implement fcntl and flock locking functions. 
*/ @@ -14,11 +34,11 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; + u64 owner; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) @@ -32,25 +52,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type); + if (lock_type == CEPH_LOCK_FCNTL) + owner = secure_addr(fl->fl_owner); + else + owner = secure_addr(fl->fl_file); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); - /* This should be adjusted, but I'm not sure if - namespaces actually get id numbers*/ - req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); - if ( operation == CEPH_MDS_OP_GETFILELOCK){ + if (operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; @@ -93,8 +115,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) return -ENOLCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_lock, fl_pid:%d", fl->fl_pid); + dout("ceph_lock, fl_owner: %p", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ if (IS_GETLK(cmd)) @@ -111,7 +132,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - if ( op != CEPH_MDS_OP_GETFILELOCK ){ + if (op != CEPH_MDS_OP_GETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { @@ -145,8 +166,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) return -ENOLCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_flock, fl_pid:%d", fl->fl_pid); + dout("ceph_flock, fl_file: %p", fl->fl_file); if (IS_SETLKW(cmd)) wait = 1; @@ -289,13 +309,14 @@ int lock_to_ceph_filelock(struct file_lock *lock, struct ceph_filelock *cephlock) { int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + cephlock->pid = 
cpu_to_le64((u64)lock->fl_pid); + if (lock->fl_flags & FL_POSIX) + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + else + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file)); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 10a4ccbf38da..06150fd745ac 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1026,6 +1026,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_flock_init(); ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 70bb183385b7..7866cd05a6bb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -871,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern const struct export_operations ceph_export_ops; /* locks.c */ +extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 35f345f7b3a3..5f6db18d72e8 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -421,8 +421,8 @@ union ceph_mds_request_args { struct { __u8 rule; /* currently fcntl or flock */ __u8 type; /* shared, exclusive, remove*/ + __le64 owner; /* owner of the lock */ __le64 pid; /* process id requesting the lock */ - __le64 pid_namespace; __le64 start; /* initial location to lock */ __le64 length; /* num bytes to lock from start */ __u8 wait; /* will caller wait for lock to become available? */ @@ -533,8 +533,8 @@ struct ceph_filelock { __le64 start;/* file offset to start lock at */ __le64 length; /* num bytes to lock; 0 for all following start */ __le64 client; /* which client holds the lock */ + __le64 owner; /* owner the lock */ __le64 pid; /* process id holding the lock on the client */ - __le64 pid_namespace; __u8 type; /* shared lock, exclusive lock, or unlock */ } __attribute__ ((packed)); -- cgit v1.2.3 From e2b149cc4ba00766aceb87950c6de72ea7fc8b2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: add chooseleaf_vary_r tunable The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. 
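The arithmetic is easy to check in isolation. A standalone C sketch of the derivation, mirroring the sub_r computation added below (the sample r values stand in for rep + parent_r + ftotal; this is illustration, not kernel code):

    #include <stdio.h>

    /*
     * With vary_r == 0 the recursive chooseleaf call always restarts
     * from the same point; with vary_r >= 1 it inherits a (shifted)
     * copy of the parent's r, so each outer retry descends differently.
     */
    static int sub_r(int r, unsigned int vary_r)
    {
            return vary_r ? r >> (vary_r - 1) : 0;
    }

    int main(void)
    {
            int r;

            for (r = 0; r < 6; r++)
                    printf("r=%d: vary_r=0 -> %d, vary_r=1 -> %d, vary_r=3 -> %d\n",
                           r, sub_r(r, 0), sub_r(r, 1), sub_r(r, 3));
            return 0;
    }

With vary_r=1 every retry starts the recursion somewhere new; with vary_r=3 or 4 the shift damps the change, which is why those values disturb existing mappings less.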
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- include/linux/crush/crush.h | 6 ++++++ net/ceph/crush/mapper.c | 30 ++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index acaa5615d634..75f36a6c7f67 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -173,6 +173,12 @@ struct crush_map { * apply to a collision: in that case we will retry as we used * to. */ __u32 chooseleaf_descend_once; + + /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1) + * bits. a value of 1 is best for new clusters. for legacy clusters + * that want to limit reshuffling, a value of 3 or 4 will make the + * mappings line up a bit better with previous mappings. */ + __u8 chooseleaf_vary_r; }; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b3fb84903b30..947150cde297 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, struct crush_bucket *bucket, @@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_retries, unsigned int local_fallback_retries, int recurse_to_leaf, - int *out2) + unsigned int vary_r, + int *out2, + int parent_r) { int rep; unsigned int ftotal, flocal; @@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map, int itemtype; int collide, reject; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map, do { collide = 0; retry_bucket = 0; - r = rep; + r = rep + parent_r; /* r' = r + f_total */ r += ftotal; @@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, @@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map, local_retries, local_fallback_retries, 0, - NULL) <= outpos) + vary_r, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map, int choose_local_retries = map->choose_local_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries; + int vary_r = map->chooseleaf_vary_r; + if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); return 0; @@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map, choose_local_retries, choose_local_fallback_retries, recurse_to_leaf, - c+osize); + vary_r, + c+osize, + 0); } else { crush_choose_indep( map, -- cgit v1.2.3 From d83ed858f144e3cfbef883d4bc499113cdddabeb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: add SET_CHOOSELEAF_VARY_R step This lets you adjust the vary_r tunable on a per-rule basis. Reflects ceph.git commit f944ccc20aee60a7d8da7e405ec75ad1cd449fac. Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- include/linux/crush/crush.h | 1 + net/ceph/crush/mapper.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 75f36a6c7f67..4fad5f8ee01d 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -51,6 +51,7 @@ enum { CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, + CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 947150cde297..a1ef53c04415 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -709,6 +709,11 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; -- cgit v1.2.3 From 07bd7de47a65767432ceb66d4ab30cdc05ed2b35 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: support chooseleaf_vary_r tunable (tunables3) by default Add TUNABLES3 feature (chooseleaf_vary_r tunable) to a set of features supported by default. 
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- include/linux/ceph/ceph_features.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 138448f766b4..77c097fe9ea9 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -43,6 +43,13 @@ #define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ #define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) #define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) +#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */ +/* The process supports new-style OSDMap encoding. Monitors also use + this bit to determine if peers support NAK messages. */ +#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39) +#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40) +#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41) +#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -82,7 +89,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ - CEPH_FEATURE_EXPORT_PEER) + CEPH_FEATURE_EXPORT_PEER | \ + CEPH_FEATURE_CRUSH_TUNABLES3) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From a2505d63ee0541d9b4685250b033192e68222e97 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: split osdmap allocation and decode steps Split osdmap allocation and initialization into a separate function, ceph_osdmap_decode(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 2 +- net/ceph/osd_client.c | 2 +- net/ceph/osdmap.c | 44 +++++++++++++++++++++++++++++--------------- 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8c8b3cefc28b..46c3e304c3d8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -156,7 +156,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } -extern struct ceph_osdmap *osdmap_decode(void **p, void *end); +extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map, struct ceph_messenger *msgr); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 71830d79b0f4..6f64eec18851 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2062,7 +2062,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4dd000d128fd..a82df6ea0749 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -684,9 +684,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) /* * decode a full map. 
*/ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { - struct ceph_osdmap *map; u16 version; u32 len, max, i; int err = -EINVAL; @@ -694,14 +693,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) void *start = *p; struct ceph_pg_pool_info *pi; - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - - map->pg_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); ceph_decode_16_safe(p, end, version, bad); if (version > 6) { @@ -751,7 +743,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = osdmap_set_max_osd(map, max); if (err < 0) goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); /* osds */ err = -EINVAL; @@ -819,7 +810,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p = end; dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); - return map; + return 0; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", @@ -827,8 +818,31 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - ceph_osdmap_destroy(map); - return ERR_PTR(err); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; } /* @@ -872,7 +886,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end)); } /* new crush? */ -- cgit v1.2.3 From 35a935d75d51abe58d3427a8b4ae3745a5a14e1c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: generalize ceph_pg_mapping In preparation for adding support for primary_temp mappings, generalize struct ceph_pg_mapping so it can hold mappings other than pg_temp. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 9 +++++++-- net/ceph/debugfs.c | 4 ++-- net/ceph/osdmap.c | 8 ++++---- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 46c3e304c3d8..4837e58e3203 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -60,8 +60,13 @@ struct ceph_object_id { struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; - int len; - int osds[]; + + union { + struct { + int len; + int osds[]; + } pg_temp; + }; }; struct ceph_osdmap { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c45d235e774e..5865f2c9580a 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -88,9 +88,9 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, pg->pgid.seed); - for (i = 0; i < pg->len; i++) + for (i = 0; i < pg->pg_temp.len; i++) seq_printf(s, "%s%d", (i == 0 ? 
"" : ","), - pg->osds[i]); + pg->pg_temp.osds[i]); seq_printf(s, "]\n"); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index be2a65fbd902..c67a309fdfc2 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -822,9 +822,9 @@ static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, return -ENOMEM; pg->pgid = pgid; - pg->len = len; + pg->pg_temp.len = len; for (i = 0; i < len; i++) - pg->osds[i] = ceph_decode_32(p); + pg->pg_temp.osds[i] = ceph_decode_32(p); ret = __insert_pg_mapping(pg, &map->pg_temp); if (ret) { @@ -1281,8 +1281,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pg_num_mask); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - *num = pg->len; - return pg->osds; + *num = pg->pg_temp.len; + return pg->pg_temp.osds; } /* crush */ -- cgit v1.2.3 From 9686f94c8cfc06e8afb7b2233ab8f1f6ac01957f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: primary_temp infrastructure Add primary_temp mappings infrastructure. struct ceph_pg_mapping is overloaded, primary_temp mappings are stored in an rb-tree, rooted at ceph_osdmap, in a manner similar to pg_temp mappings. Dump primary_temp mappings to /sys/kernel/debug/ceph//osdmap, one 'primary_temp ' per line, e.g: primary_temp 2.6 4 Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 5 +++++ net/ceph/debugfs.c | 7 +++++++ net/ceph/osdmap.c | 10 +++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 4837e58e3203..db4fb6322aae 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -66,6 +66,9 @@ struct ceph_pg_mapping { int len; int osds[]; } pg_temp; + struct { + int osd; + } primary_temp; }; }; @@ -83,6 +86,8 @@ struct ceph_osdmap { struct ceph_entity_addr *osd_addr; struct rb_root pg_temp; + struct rb_root primary_temp; + struct rb_root pg_pools; u32 pool_max; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 5865f2c9580a..612bf55e6a8b 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -93,6 +93,13 @@ static int osdmap_show(struct seq_file *s, void *p) pg->pg_temp.osds[i]); seq_printf(s, "]\n"); } + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, + pg->pgid.seed, pg->primary_temp.osd); + } return 0; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c67a309fdfc2..c0fc517ab321 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -343,7 +343,7 @@ bad: /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) + * to a set of osds) and primary_temp (explicit primary setting) */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { @@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->primary_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->primary_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->primary_temp); + kfree(pg); + } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), @@ -966,6 +973,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; 
mutex_init(&map->crush_scratch_mutex); ret = osdmap_decode(p, end, map); -- cgit v1.2.3 From 2cfa34f2d67a36e292cbe6e4c1e60d212b7ba4d1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_affinity infrastructure Add primary_affinity infrastructure. primary_affinity values are stored in an max_osd-sized array, hanging off ceph_osdmap, similar to a osd_weight array. Introduce {get,set}_primary_affinity() helpers, primarily to return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY when no affinity has been set and to abstract out osd_primary_affinity array allocation and initialization. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 3 +++ include/linux/ceph/rados.h | 4 ++++ net/ceph/debugfs.c | 5 +++-- net/ceph/osdmap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index db4fb6322aae..6e030cb3c9ca 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -88,6 +88,8 @@ struct ceph_osdmap { struct rb_root pg_temp; struct rb_root primary_temp; + u32 *osd_primary_affinity; + struct rb_root pg_pools; u32 pool_max; @@ -134,6 +136,7 @@ static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) } extern char *ceph_osdmap_state_str(char *str, int len, int state); +extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, int osd) diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2caabef8d369..bb6f40c9cb0f 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -133,6 +133,10 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSD_IN 0x10000 #define CEPH_OSD_OUT 0 +/* osd primary-affinity. 
fixed point value: 0x10000 == baseline */ +#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000 +#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000 /* * osd map flag bits diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 612bf55e6a8b..34453a2b4b4d 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -77,10 +77,11 @@ static int osdmap_show(struct seq_file *s, void *p) int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", i, ceph_pr_addr(&addr->in_addr), ((map->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); + ceph_osdmap_state_str(sb, sizeof(sb), state), + ((ceph_get_primary_affinity(map, i)*100) >> 16)); } for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { struct ceph_pg_mapping *pg = diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c2d793d3d98d..0ac129388e07 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -649,6 +649,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_state); kfree(map->osd_weight); kfree(map->osd_addr); + kfree(map->osd_primary_affinity); kfree(map); } @@ -685,6 +686,20 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) map->osd_weight = weight; map->osd_addr = addr; + if (map->osd_primary_affinity) { + u32 *affinity; + + affinity = krealloc(map->osd_primary_affinity, + max*sizeof(*affinity), GFP_NOFS); + if (!affinity) + return -ENOMEM; + + for (i = map->max_osd; i < max; i++) + affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + map->osd_primary_affinity = affinity; + } + map->max_osd = max; return 0; @@ -912,6 +927,38 @@ static int decode_new_primary_temp(void **p, void *end, return __decode_primary_temp(p, end, map, true); } +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) { + int i; + + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), + GFP_NOFS); + if (!map->osd_primary_affinity) + return -ENOMEM; + + for (i = 0; i < map->max_osd; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->osd_primary_affinity[osd] = aff; + + return 0; +} + /* * decode a full map. */ -- cgit v1.2.3 From ddf3a21a03d0f01c5ba83deaecd2d0c381d5ef42 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:31 +0200 Subject: libceph: enable OSDMAP_ENC feature bit Announce our support for "new" (v7 - split and separately versioned client and osd sections) osdmap encoding.
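The "new" encoding prefixes each section with an explicit version and byte length, which is what allows the client-usable and osd-only sections to be versioned independently. A schematic of the resulting decode pattern, using the ceph_decode_*_safe helpers from include/linux/ceph/decode.h (shape only; the real decoder changes land in net/ceph/osdmap.c):

    /* schematic: decode one length-prefixed, versioned section */
    u8 struct_v;
    u32 struct_len;
    void *section_end;

    ceph_decode_8_safe(p, end, struct_v, bad);    /* section version */
    ceph_decode_32_safe(p, end, struct_len, bad); /* section length */
    ceph_decode_need(p, end, struct_len, bad);
    section_end = *p + struct_len;
    /* ... decode the fields this struct_v is known to carry ... */
    *p = section_end;    /* skip trailing fields from newer encoders */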
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/ceph_features.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 77c097fe9ea9..7a4cab50b2cd 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -90,6 +90,7 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ + CEPH_FEATURE_OSDMAP_ENC | \ CEPH_FEATURE_CRUSH_TUNABLES3) #define CEPH_FEATURES_REQUIRED_DEFAULT \ -- cgit v1.2.3 From 246138fa6787db6f4016f26604fdc05dc9f95627 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:46 +0200 Subject: libceph: ceph_osd_{exists,is_up,is_down}(osd) definitions Sync up with ceph.git definitions. Bring in ceph_osd_is_down(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 6e030cb3c9ca..0895797b9e28 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -125,9 +125,21 @@ static inline void ceph_oid_copy(struct ceph_object_id *dest, dest->name_len = src->name_len; } +static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) +{ + return osd >= 0 && osd < map->max_osd && + (map->osd_state[osd] & CEPH_OSD_EXISTS); +} + static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) { - return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); + return ceph_osd_exists(map, osd) && + (map->osd_state[osd] & CEPH_OSD_UP); +} + +static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) +{ + return !ceph_osd_is_up(map, osd); } static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) -- cgit v1.2.3 From 2abebdbca7997422bfab6bf8b6559384a6b95294 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:47 +0200 Subject: libceph: ceph_can_shift_osds(pool) and pool type defines Bring in pg_pool_t::can_shift_osds() counterpart along with pool type defines. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 12 ++++++++++++ include/linux/ceph/rados.h | 5 +++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 0895797b9e28..4e28c1e5d62f 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -41,6 +41,18 @@ struct ceph_pg_pool_info { char *name; }; +static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) +{ + switch (pool->type) { + case CEPH_POOL_TYPE_REP: + return true; + case CEPH_POOL_TYPE_EC: + return false; + default: + BUG_ON(1); + } +} + struct ceph_object_locator { s64 pool; }; diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index bb6f40c9cb0f..f20e0d8a2155 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -81,8 +81,9 @@ struct ceph_pg_v1 { */ #define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 +#define CEPH_POOL_TYPE_REP 1 +#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */ +#define CEPH_POOL_TYPE_EC 3 /* * stable_mod func is used to control number of placement groups. 
-- cgit v1.2.3 From ac972230e20581b044f5ce66dcaf3c5af8d57444 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: switch ceph_calc_pg_acting() to new helpers Switch ceph_calc_pg_acting() to new helpers: pg_to_raw_osds(), raw_to_up_osds() and apply_temps(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 2 +- net/ceph/osdmap.c | 51 +++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 4e28c1e5d62f..b0c8f8490663 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -212,7 +212,7 @@ extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting); + int *osds); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index bd40f56b53ed..f1cad21d1533 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1649,24 +1649,49 @@ static int apply_temps(struct ceph_osdmap *osdmap, } /* - * Return acting set for given pgid. + * Calculate acting set for given pgid. + * + * Return acting set length, or error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting) + int *osds) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, o, num = CEPH_PG_MAX_SIZE; + struct ceph_pg_pool_info *pool; + u32 pps; + int len; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); + if (!pool) + return 0; - /* primary is first up osd */ - o = 0; - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - acting[o++] = osds[i]; - return o; + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask), + pgid.pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask) + + (unsigned)pgid.pool; + } + + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); + if (len < 0) + return len; + + len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + + len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + + return len; } /* -- cgit v1.2.3 From 8008ab1080c1768b02d232dcfd9e161cd47cc9f7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: return primary from ceph_calc_pg_acting() In preparation for adding support for primary_temp, stop assuming primaryness: add a primary out parameter to ceph_calc_pg_acting() and change call sites accordingly. Primary is now specified separately from the order of osds in the set. 
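Callers now consume both outputs and must not infer the primary from osds[0]. A minimal sketch of the new calling convention, condensed from the __map_request() change below:

    int osds[CEPH_PG_MAX_SIZE];
    int primary;
    int len;

    len = ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
    if (len < 0)
            len = 0;    /* no pool or error: treat as empty acting set */
    /*
     * primary is the acting primary osd id, or -1 if the set is empty;
     * once primary_temp support lands it may differ from osds[0].
     */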
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 2 +- net/ceph/osd_client.c | 10 ++++------ net/ceph/osdmap.c | 20 ++++++++++++-------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index b0c8f8490663..561ea896c657 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -212,7 +212,7 @@ extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds); + int *osds, int *primary); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f64eec18851..b4157dc22199 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1333,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc, { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; bool was_paused; @@ -1346,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f1cad21d1533..df9389ddd56c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1651,19 +1651,21 @@ static int apply_temps(struct ceph_osdmap *osdmap, /* * Calculate acting set for given pgid. * - * Return acting set length, or error. + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds) + int *osds, int *primary) { struct ceph_pg_pool_info *pool; u32 pps; int len; - int primary; pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return 0; + if (!pool) { + *primary = -1; + return -ENOENT; + } if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { /* hash pool id and seed so that pool PGs do not overlap */ @@ -1684,12 +1686,14 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) + if (len < 0) { + *primary = -1; return len; + } - len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + len = raw_to_up_osds(osdmap, pool, osds, len, primary); - len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + len = apply_temps(osdmap, pool, pgid, osds, len, primary); return len; } -- cgit v1.2.3 From 18cb95af2d7c69aa136ab13f02dd55188c120e75 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:50 +0200 Subject: libceph: enable PRIMARY_AFFINITY feature bit Announce our support for osdmaps with non-default primary affinity values. 
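Primary affinity is a fixed-point weight in [0, 0x10000], with 0x10000 meaning 1.0 (the baseline). A standalone sketch of the scaling, using the same (aff*100) >> 16 arithmetic as the debugfs dump earlier in this series (illustration, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000

    int main(void)
    {
            uint32_t aff[] = { 0x0, 0x4000, 0x8000,
                               CEPH_OSD_DEFAULT_PRIMARY_AFFINITY };
            int i;

            /* prints 0%, 25%, 50%, 100% */
            for (i = 0; i < 4; i++)
                    printf("0x%05x -> %u%%\n", aff[i],
                           (unsigned int)(((uint64_t)aff[i] * 100) >> 16));
            return 0;
    }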
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/ceph_features.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 7a4cab50b2cd..d12659ce550d 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -91,7 +91,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ - CEPH_FEATURE_CRUSH_TUNABLES3) + CEPH_FEATURE_CRUSH_TUNABLES3 | \ + CEPH_FEATURE_OSD_PRIMARY_AFFINITY) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3
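Taken together, the osd_client pieces in this series let a caller queue the allocation hint alongside a write in a single multi-op request, which is how rbd is expected to use it. A condensed sketch of that usage (hypothetical caller; the 4 MiB sizes and the surrounding setup are illustrative assumptions, not code from this series):

    /* hypothetical caller: hint + write as ops 0 and 1 of one request */
    struct ceph_osd_request *osd_req;

    osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_NOIO);
    if (!osd_req)
            return -ENOMEM;

    osd_req_op_alloc_hint_init(osd_req, 0,
                               4 * 1024 * 1024,  /* expected_object_size */
                               4 * 1024 * 1024); /* expected_write_size */
    osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
                           offset, length, 0, 0);
    /* ... attach data, build and start the request as usual ... */

Because op 0 carries CEPH_OSD_OP_FLAG_FAILOK, an older osd that does not understand SETALLOCHINT is expected to fail that op harmlessly rather than reject the whole request.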