diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-03-01 02:36:09 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-03-01 02:36:09 +0300 |
commit | b2deee2dc06db7cdf99b84346e69bdb9db9baa85 (patch) | |
tree | ceb073fa12c1a9804761ec8ce8911a517b007ed6 | |
parent | d4f4cf77b37eaea58ef863a4cbc95dad3880b524 (diff) | |
parent | 54ea0046b6fe36ec18e82d282a29a18da6cdea0f (diff) | |
download | linux-b2deee2dc06db7cdf99b84346e69bdb9db9baa85.tar.xz |
Merge tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"This time around we have:
- support for rbd data-pool feature, which enables rbd images on
erasure-coded pools (myself). CEPH_PG_MAX_SIZE has been bumped to
allow erasure-coded profiles with k+m up to 32.
- a patch for ceph_d_revalidate() performance regression introduced
in 4.9, along with some cleanups in the area (Jeff Layton)
- a set of fixes for unsafe ->d_parent accesses in CephFS (Jeff
Layton)
- buffered reads are now processed in rsize windows instead of rasize
windows (Andreas Gerstmayr). The new default for rsize mount option
is 64M.
- ack vs commit distinction is gone, greatly simplifying ->fsync()
and MOSDOpReply handling code (myself)
... also a few filesystem bug fixes from Zheng, a CRUSH sync up (CRUSH
computations are still serialized though) and several minor fixes and
cleanups all over"
* tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client: (52 commits)
libceph, rbd, ceph: WRITE | ONDISK -> WRITE
libceph: get rid of ack vs commit
ceph: remove special ack vs commit behavior
ceph: tidy some white space in get_nonsnap_parent()
crush: fix dprintk compilation
crush: do is_out test only if we do not collide
ceph: remove req from unsafe list when unregistering it
rbd: constify device_type structure
rbd: kill obj_request->object_name and rbd_segment_name_cache
rbd: store and use obj_request->object_no
rbd: RBD_V{1,2}_DATA_FORMAT macros
rbd: factor out __rbd_osd_req_create()
rbd: set offset and length outside of rbd_obj_request_create()
rbd: support for data-pool feature
rbd: introduce rbd_init_layout()
rbd: use rbd_obj_bytes() more
rbd: remove now unused rbd_obj_request_wait() and helpers
rbd: switch rbd_obj_method_sync() to ceph_osdc_call()
libceph: pass reply buffer length through ceph_osdc_call()
rbd: do away with obj_request in rbd_obj_read_sync()
...
-rw-r--r-- | Documentation/filesystems/ceph.txt | 5 | ||||
-rw-r--r-- | drivers/block/rbd.c | 601 | ||||
-rw-r--r-- | drivers/block/rbd_types.h | 10 | ||||
-rw-r--r-- | fs/ceph/addr.c | 19 | ||||
-rw-r--r-- | fs/ceph/cache.c | 2 | ||||
-rw-r--r-- | fs/ceph/caps.c | 40 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 2 | ||||
-rw-r--r-- | fs/ceph/dir.c | 32 | ||||
-rw-r--r-- | fs/ceph/export.c | 3 | ||||
-rw-r--r-- | fs/ceph/file.c | 106 | ||||
-rw-r--r-- | fs/ceph/inode.c | 172 | ||||
-rw-r--r-- | fs/ceph/ioctl.c | 4 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 175 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 15 | ||||
-rw-r--r-- | fs/ceph/super.c | 9 | ||||
-rw-r--r-- | fs/ceph/super.h | 14 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 6 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 13 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 2 | ||||
-rw-r--r-- | include/linux/crush/crush.h | 41 | ||||
-rw-r--r-- | include/linux/crush/mapper.h | 16 | ||||
-rw-r--r-- | net/ceph/cls_lock_client.c | 14 | ||||
-rw-r--r-- | net/ceph/crush/crush.c | 5 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 227 | ||||
-rw-r--r-- | net/ceph/crypto.c | 1 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 130 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 101 | ||||
-rw-r--r-- | net/ceph/snapshot.c | 2 |
28 files changed, 835 insertions, 932 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index f5306ee40ea9..0b302a11718a 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -98,11 +98,10 @@ Mount Options size. rsize=X - Specify the maximum read size in bytes. By default there is no - maximum. + Specify the maximum read size in bytes. Default: 64 MB. rasize=X - Specify the maximum readahead. + Specify the maximum readahead. Default: 8 MB. mount_timeout=X Specify the timeout value for mount (in seconds), in the case diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 362cecc77130..4d6807723798 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -123,9 +123,11 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_FEATURE_LAYERING (1<<0) #define RBD_FEATURE_STRIPINGV2 (1<<1) #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) +#define RBD_FEATURE_DATA_POOL (1<<7) #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ RBD_FEATURE_STRIPINGV2 | \ - RBD_FEATURE_EXCLUSIVE_LOCK) + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_DATA_POOL) /* Features supported by this (client software) implementation. */ @@ -144,10 +146,9 @@ struct rbd_image_header { /* These six fields never change for a given rbd image */ char *object_prefix; __u8 obj_order; - __u8 crypt_type; - __u8 comp_type; u64 stripe_unit; u64 stripe_count; + s64 data_pool_id; u64 features; /* Might be changeable someday? */ /* The remaining fields need to be updated occasionally */ @@ -230,7 +231,7 @@ enum obj_req_flags { }; struct rbd_obj_request { - const char *object_name; + u64 object_no; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ unsigned long flags; @@ -438,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; -static struct kmem_cache *rbd_segment_name_cache; static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -973,6 +973,30 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) } /* + * returns the size of an object in the image + */ +static u32 rbd_obj_bytes(struct rbd_image_header *header) +{ + return 1U << header->obj_order; +} + +static void rbd_init_layout(struct rbd_device *rbd_dev) +{ + if (rbd_dev->header.stripe_unit == 0 || + rbd_dev->header.stripe_count == 0) { + rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); + rbd_dev->header.stripe_count = 1; + } + + rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; + rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; + rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); + rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? + rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; + RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); +} + +/* * Fill an rbd image header with information from the given format 1 * on-disk header. */ @@ -992,15 +1016,11 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, /* Allocate this now to avoid having to handle failure below */ if (first_time) { - size_t len; - - len = strnlen(ondisk->object_prefix, - sizeof (ondisk->object_prefix)); - object_prefix = kmalloc(len + 1, GFP_KERNEL); + object_prefix = kstrndup(ondisk->object_prefix, + sizeof(ondisk->object_prefix), + GFP_KERNEL); if (!object_prefix) return -ENOMEM; - memcpy(object_prefix, ondisk->object_prefix, len); - object_prefix[len] = '\0'; } /* Allocate the snapshot context and fill it in */ @@ -1051,12 +1071,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; - header->crypt_type = ondisk->options.crypt_type; - header->comp_type = ondisk->options.comp_type; - /* The rest aren't used for format 1 images */ - header->stripe_unit = 0; - header->stripe_count = 0; - header->features = 0; + rbd_init_layout(rbd_dev); } else { ceph_put_snap_context(header->snapc); kfree(header->snap_names); @@ -1232,42 +1247,9 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.features = 0; } -static void rbd_segment_name_free(const char *name) -{ - /* The explicit cast here is needed to drop the const qualifier */ - - kmem_cache_free(rbd_segment_name_cache, (void *)name); -} - -static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) -{ - char *name; - u64 segment; - int ret; - char *name_format; - - name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); - if (!name) - return NULL; - segment = offset >> rbd_dev->header.obj_order; - name_format = "%s.%012llx"; - if (rbd_dev->image_format == 2) - name_format = "%s.%016llx"; - ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, - rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { - pr_err("error formatting segment name for #%llu (%d)\n", - segment, ret); - rbd_segment_name_free(name); - name = NULL; - } - - return name; -} - static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) { - u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; + u64 segment_size = rbd_obj_bytes(&rbd_dev->header); return offset & (segment_size - 1); } @@ -1275,7 +1257,7 @@ static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) static u64 rbd_segment_length(struct rbd_device *rbd_dev, u64 offset, u64 length) { - u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; + u64 segment_size = rbd_obj_bytes(&rbd_dev->header); offset &= segment_size - 1; @@ -1287,14 +1269,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, } /* - * returns the size of an object in the image - */ -static u64 rbd_obj_bytes(struct rbd_image_header *header) -{ - return 1 << header->obj_order; -} - -/* * bio helpers */ @@ -1623,7 +1597,9 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; - dout("%s %p osd_req %p\n", __func__, obj_request, osd_req); + dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, + obj_request, obj_request->object_no, obj_request->offset, + obj_request->length, osd_req); if (obj_request_img_data_test(obj_request)) { WARN_ON(obj_request->callback != rbd_img_obj_callback); rbd_img_request_get(obj_request->img_request); @@ -1631,44 +1607,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } -static void rbd_obj_request_end(struct rbd_obj_request *obj_request) -{ - dout("%s %p\n", __func__, obj_request); - ceph_osdc_cancel_request(obj_request->osd_req); -} - -/* - * Wait for an object request to complete. If interrupted, cancel the - * underlying osd request. - * - * @timeout: in jiffies, 0 means "wait forever" - */ -static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, - unsigned long timeout) -{ - long ret; - - dout("%s %p\n", __func__, obj_request); - ret = wait_for_completion_interruptible_timeout( - &obj_request->completion, - ceph_timeout_jiffies(timeout)); - if (ret <= 0) { - if (ret == 0) - ret = -ETIMEDOUT; - rbd_obj_request_end(obj_request); - } else { - ret = 0; - } - - dout("%s %p ret %d\n", __func__, obj_request, (int)ret); - return ret; -} - -static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) -{ - return __rbd_obj_request_wait(obj_request, 0); -} - static void rbd_img_request_complete(struct rbd_img_request *img_request) { @@ -1955,8 +1893,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) rbd_osd_call_callback(obj_request); break; default: - rbd_warn(NULL, "%s: unsupported op %hu", - obj_request->object_name, (unsigned short) opcode); + rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", + obj_request->object_no, opcode); break; } @@ -1980,6 +1918,40 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) osd_req->r_data_offset = obj_request->offset; } +static struct ceph_osd_request * +__rbd_osd_req_create(struct rbd_device *rbd_dev, + struct ceph_snap_context *snapc, + int num_ops, unsigned int flags, + struct rbd_obj_request *obj_request) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request *req; + const char *name_format = rbd_dev->image_format == 1 ? + RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; + + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); + if (!req) + return NULL; + + req->r_flags = flags; + req->r_callback = rbd_osd_req_callback; + req->r_priv = obj_request; + + req->r_base_oloc.pool = rbd_dev->layout.pool_id; + if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, + rbd_dev->header.object_prefix, obj_request->object_no)) + goto err_req; + + if (ceph_osdc_alloc_messages(req, GFP_NOIO)) + goto err_req; + + return req; + +err_req: + ceph_osdc_put_request(req); + return NULL; +} + /* * Create an osd request. A read request has one osd op (read). * A write request has either one (watch) or two (hint+write) osd ops. @@ -1993,8 +1965,6 @@ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_obj_request *obj_request) { struct ceph_snap_context *snapc = NULL; - struct ceph_osd_client *osdc; - struct ceph_osd_request *osd_req; if (obj_request_img_data_test(obj_request) && (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { @@ -2009,35 +1979,9 @@ static struct ceph_osd_request *rbd_osd_req_create( rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); - /* Allocate and initialize the request, for the num_ops ops */ - - osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, - GFP_NOIO); - if (!osd_req) - goto fail; - - if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) - osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; - else - osd_req->r_flags = CEPH_OSD_FLAG_READ; - - osd_req->r_callback = rbd_osd_req_callback; - osd_req->r_priv = obj_request; - - osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; - if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", - obj_request->object_name)) - goto fail; - - if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) - goto fail; - - return osd_req; - -fail: - ceph_osdc_put_request(osd_req); - return NULL; + return __rbd_osd_req_create(rbd_dev, snapc, num_ops, + (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? + CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); } /* @@ -2050,10 +1994,6 @@ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; - struct ceph_snap_context *snapc; - struct rbd_device *rbd_dev; - struct ceph_osd_client *osdc; - struct ceph_osd_request *osd_req; int num_osd_ops = 3; rbd_assert(obj_request_img_data_test(obj_request)); @@ -2065,77 +2005,34 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) if (img_request_discard_test(img_request)) num_osd_ops = 2; - /* Allocate and initialize the request, for all the ops */ - - snapc = img_request->snapc; - rbd_dev = img_request->rbd_dev; - osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, - false, GFP_NOIO); - if (!osd_req) - goto fail; - - osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; - osd_req->r_callback = rbd_osd_req_callback; - osd_req->r_priv = obj_request; - - osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; - if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", - obj_request->object_name)) - goto fail; - - if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) - goto fail; - - return osd_req; - -fail: - ceph_osdc_put_request(osd_req); - return NULL; + return __rbd_osd_req_create(img_request->rbd_dev, + img_request->snapc, num_osd_ops, + CEPH_OSD_FLAG_WRITE, obj_request); } - static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); } -/* object_name is assumed to be a non-null pointer and NUL-terminated */ - -static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, - u64 offset, u64 length, - enum obj_request_type type) +static struct rbd_obj_request * +rbd_obj_request_create(enum obj_request_type type) { struct rbd_obj_request *obj_request; - size_t size; - char *name; rbd_assert(obj_request_type_valid(type)); - size = strlen(object_name) + 1; - name = kmalloc(size, GFP_NOIO); - if (!name) - return NULL; - obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); - if (!obj_request) { - kfree(name); + if (!obj_request) return NULL; - } - obj_request->object_name = memcpy(name, object_name, size); - obj_request->offset = offset; - obj_request->length = length; - obj_request->flags = 0; obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); init_completion(&obj_request->completion); kref_init(&obj_request->kref); - dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, - offset, length, (int)type, obj_request); - + dout("%s %p\n", __func__, obj_request); return obj_request; } @@ -2170,8 +2067,6 @@ static void rbd_obj_request_destroy(struct kref *kref) break; } - kfree(obj_request->object_name); - obj_request->object_name = NULL; kmem_cache_free(rbd_obj_request_cache, obj_request); } @@ -2546,22 +2441,18 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, while (resid) { struct ceph_osd_request *osd_req; - const char *object_name; - u64 offset; - u64 length; + u64 object_no = img_offset >> rbd_dev->header.obj_order; + u64 offset = rbd_segment_offset(rbd_dev, img_offset); + u64 length = rbd_segment_length(rbd_dev, img_offset, resid); - object_name = rbd_segment_name(rbd_dev, img_offset); - if (!object_name) - goto out_unwind; - offset = rbd_segment_offset(rbd_dev, img_offset); - length = rbd_segment_length(rbd_dev, img_offset, resid); - obj_request = rbd_obj_request_create(object_name, - offset, length, type); - /* object request has its own copy of the object name */ - rbd_segment_name_free(object_name); + obj_request = rbd_obj_request_create(type); if (!obj_request) goto out_unwind; + obj_request->object_no = object_no; + obj_request->offset = offset; + obj_request->length = length; + /* * set obj_request->img_request before creating the * osd_request so that it gets the right snapc @@ -2771,7 +2662,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) * child image to which the original request was to be sent. */ img_offset = obj_request->img_offset - obj_request->offset; - length = (u64)1 << rbd_dev->header.obj_order; + length = rbd_obj_bytes(&rbd_dev->header); /* * There is no defined parent data beyond the parent @@ -2900,11 +2791,12 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) size_t size; int ret; - stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, - OBJ_REQUEST_PAGES); + stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); if (!stat_request) return -ENOMEM; + stat_request->object_no = obj_request->object_no; + stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, stat_request); if (!stat_request->osd_req) { @@ -3983,17 +3875,17 @@ out: * returned in the outbound buffer, or a negative error code. */ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, - const char *object_name, - const char *class_name, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, const char *method_name, const void *outbound, size_t outbound_size, void *inbound, size_t inbound_size) { - struct rbd_obj_request *obj_request; - struct page **pages; - u32 page_count; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct page *req_page = NULL; + struct page *reply_page; int ret; /* @@ -4003,61 +3895,35 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, * method. Currently if this is present it will be a * snapshot id. */ - page_count = (u32)calc_pages_for(0, inbound_size); - pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = -ENOMEM; - obj_request = rbd_obj_request_create(object_name, 0, inbound_size, - OBJ_REQUEST_PAGES); - if (!obj_request) - goto out; + if (outbound) { + if (outbound_size > PAGE_SIZE) + return -E2BIG; - obj_request->pages = pages; - obj_request->page_count = page_count; - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; - - osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, - class_name, method_name); - if (outbound_size) { - struct ceph_pagelist *pagelist; - - pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); - if (!pagelist) - goto out; + req_page = alloc_page(GFP_KERNEL); + if (!req_page) + return -ENOMEM; - ceph_pagelist_init(pagelist); - ceph_pagelist_append(pagelist, outbound, outbound_size); - osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, - pagelist); + memcpy(page_address(req_page), outbound, outbound_size); } - osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, - obj_request->pages, inbound_size, - 0, false, false); - - rbd_obj_request_submit(obj_request); - ret = rbd_obj_request_wait(obj_request); - if (ret) - goto out; - ret = obj_request->result; - if (ret < 0) - goto out; + reply_page = alloc_page(GFP_KERNEL); + if (!reply_page) { + if (req_page) + __free_page(req_page); + return -ENOMEM; + } - rbd_assert(obj_request->xferred < (u64)INT_MAX); - ret = (int)obj_request->xferred; - ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); -out: - if (obj_request) - rbd_obj_request_put(obj_request); - else - ceph_release_page_vector(pages, page_count); + ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, + CEPH_OSD_FLAG_READ, req_page, outbound_size, + reply_page, &inbound_size); + if (!ret) { + memcpy(inbound, page_address(reply_page), inbound_size); + ret = inbound_size; + } + if (req_page) + __free_page(req_page); + __free_page(reply_page); return ret; } @@ -4256,63 +4122,46 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) } static int rbd_obj_read_sync(struct rbd_device *rbd_dev, - const char *object_name, - u64 offset, u64 length, void *buf) + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *buf, int buf_len) { - struct rbd_obj_request *obj_request; - struct page **pages = NULL; - u32 page_count; - size_t size; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request *req; + struct page **pages; + int num_pages = calc_pages_for(0, buf_len); int ret; - page_count = (u32) calc_pages_for(offset, length); - pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = -ENOMEM; - obj_request = rbd_obj_request_create(object_name, offset, length, - OBJ_REQUEST_PAGES); - if (!obj_request) - goto out; - - obj_request->pages = pages; - obj_request->page_count = page_count; - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); + if (!req) + return -ENOMEM; - osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, - offset, length, 0, 0); - osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, - obj_request->pages, - obj_request->length, - obj_request->offset & ~PAGE_MASK, - false, false); + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = CEPH_OSD_FLAG_READ; - rbd_obj_request_submit(obj_request); - ret = rbd_obj_request_wait(obj_request); + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); if (ret) - goto out; + goto out_req; - ret = obj_request->result; - if (ret < 0) - goto out; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out_req; + } - rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); - size = (size_t) obj_request->xferred; - ceph_copy_from_page_vector(pages, buf, 0, size); - rbd_assert(size <= (size_t)INT_MAX); - ret = (int)size; -out: - if (obj_request) - rbd_obj_request_put(obj_request); - else - ceph_release_page_vector(pages, page_count); + osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); + osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, + true); + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + if (ret >= 0) + ceph_copy_from_page_vector(pages, buf, 0, ret); +out_req: + ceph_osdc_put_request(req); return ret; } @@ -4348,8 +4197,8 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) if (!ondisk) return -ENOMEM; - ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, - 0, size, ondisk); + ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, ondisk, size); if (ret < 0) goto out; if ((size_t)ret < size) { @@ -4781,7 +4630,7 @@ static const struct attribute_group *rbd_attr_groups[] = { static void rbd_dev_release(struct device *dev); -static struct device_type rbd_device_type = { +static const struct device_type rbd_device_type = { .name = "rbd", .groups = rbd_attr_groups, .release = rbd_dev_release, @@ -4876,8 +4725,9 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); + rbd_dev->header.data_pool_id = CEPH_NOPOOL; ceph_oid_init(&rbd_dev->header_oid); - ceph_oloc_init(&rbd_dev->header_oloc); + rbd_dev->header_oloc.pool = spec->pool_id; mutex_init(&rbd_dev->watch_mutex); rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; @@ -4899,12 +4749,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, rbd_dev->rbd_client = rbdc; rbd_dev->spec = spec; - rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; - rbd_dev->layout.stripe_count = 1; - rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; - rbd_dev->layout.pool_id = spec->pool_id; - RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); - return rbd_dev; } @@ -4970,10 +4814,10 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_size", - &snapid, sizeof (snapid), - &size_buf, sizeof (size_buf)); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_size", + &snapid, sizeof(snapid), + &size_buf, sizeof(size_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -5010,9 +4854,9 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_object_prefix", + NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -5045,10 +4889,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 unsup; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_features", - &snapid, sizeof (snapid), - &features_buf, sizeof (features_buf)); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_features", + &snapid, sizeof(snapid), + &features_buf, sizeof(features_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -5107,10 +4951,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(rbd_dev->spec->snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_parent", - &snapid, sizeof (snapid), - reply_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_parent", + &snapid, sizeof(snapid), reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -5210,9 +5053,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) u64 stripe_count; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_stripe_unit_count", NULL, 0, - (char *)&striping_info_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_stripe_unit_count", + NULL, 0, &striping_info_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -5226,7 +5069,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) * out, and only fail if the image has non-default values. */ ret = -EINVAL; - obj_size = (u64)1 << rbd_dev->header.obj_order; + obj_size = rbd_obj_bytes(&rbd_dev->header); p = &striping_info_buf; stripe_unit = ceph_decode_64(&p); if (stripe_unit != obj_size) { @@ -5247,8 +5090,27 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) return 0; } +static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) +{ + __le64 data_pool_id; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_data_pool", + NULL, 0, &data_pool_id, sizeof(data_pool_id)); + if (ret < 0) + return ret; + if (ret < sizeof(data_pool_id)) + return -EBADMSG; + + rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); + WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); + return 0; +} + static char *rbd_dev_image_name(struct rbd_device *rbd_dev) { + CEPH_DEFINE_OID_ONSTACK(oid); size_t image_id_size; char *image_id; void *p; @@ -5276,10 +5138,10 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) if (!reply_buf) goto out; - ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, - "rbd", "dir_get_name", - image_id, image_id_size, - reply_buf, size); + ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); + ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, + "dir_get_name", image_id, image_id_size, + reply_buf, size); if (ret < 0) goto out; p = reply_buf; @@ -5458,9 +5320,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_snapcontext", NULL, 0, - reply_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_snapcontext", + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -5523,10 +5385,9 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, return ERR_PTR(-ENOMEM); snapid = cpu_to_le64(snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_snapshot_name", - &snapid, sizeof (snapid), - reply_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_snapshot_name", + &snapid, sizeof(snapid), reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) { snap_name = ERR_PTR(ret); @@ -5833,7 +5694,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) { int ret; size_t size; - char *object_name; + CEPH_DEFINE_OID_ONSTACK(oid); void *response; char *image_id; @@ -5853,12 +5714,12 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); - object_name = kmalloc(size, GFP_NOIO); - if (!object_name) - return -ENOMEM; - sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); - dout("rbd id object name is %s\n", object_name); + ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, + rbd_dev->spec->image_name); + if (ret) + return ret; + + dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ @@ -5871,9 +5732,9 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) /* If it doesn't exist we'll assume it's a format 1 image */ - ret = rbd_obj_method_sync(rbd_dev, object_name, - "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, + "get_id", NULL, 0, + response, RBD_IMAGE_ID_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); @@ -5896,8 +5757,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) } out: kfree(response); - kfree(object_name); - + ceph_oid_destroy(&oid); return ret; } @@ -5944,14 +5804,20 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; } - /* No support for crypto and compression type format 2 images */ + if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { + ret = rbd_dev_v2_data_pool(rbd_dev); + if (ret) + goto out_err; + } + + rbd_init_layout(rbd_dev); return 0; + out_err: rbd_dev->header.features = 0; kfree(rbd_dev->header.object_prefix); rbd_dev->header.object_prefix = NULL; - return ret; } @@ -6077,8 +5943,6 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) /* Record the header object name for this rbd image. */ rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - - rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; if (rbd_dev->image_format == 1) ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", spec->image_name, RBD_SUFFIX); @@ -6471,27 +6335,16 @@ static int rbd_slab_init(void) if (!rbd_obj_request_cache) goto out_err; - rbd_assert(!rbd_segment_name_cache); - rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); - if (rbd_segment_name_cache) - return 0; -out_err: - kmem_cache_destroy(rbd_obj_request_cache); - rbd_obj_request_cache = NULL; + return 0; +out_err: kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; - return -ENOMEM; } static void rbd_slab_exit(void) { - rbd_assert(rbd_segment_name_cache); - kmem_cache_destroy(rbd_segment_name_cache); - rbd_segment_name_cache = NULL; - rbd_assert(rbd_obj_request_cache); kmem_cache_destroy(rbd_obj_request_cache); rbd_obj_request_cache = NULL; diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 94f367db27b0..62ff50d3e7a6 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -25,8 +25,8 @@ */ #define RBD_HEADER_PREFIX "rbd_header." -#define RBD_DATA_PREFIX "rbd_data." #define RBD_ID_PREFIX "rbd_id." +#define RBD_V2_DATA_FORMAT "%s.%016llx" #define RBD_LOCK_NAME "rbd_lock" #define RBD_LOCK_TAG "internal" @@ -42,13 +42,14 @@ enum rbd_notify_op { /* * For format version 1, rbd image 'foo' consists of objects * foo.rbd - image metadata - * rb.<idhi>.<idlo>.00000000 - * rb.<idhi>.<idlo>.00000001 + * rb.<idhi>.<idlo>.<extra>.000000000000 + * rb.<idhi>.<idlo>.<extra>.000000000001 * ... - data * There is no notion of a persistent image id in rbd format 1. */ #define RBD_SUFFIX ".rbd" +#define RBD_V1_DATA_FORMAT "%s.%012llx" #define RBD_DIRECTORY "rbd_directory" #define RBD_INFO "rbd_info" @@ -57,9 +58,6 @@ enum rbd_notify_op { #define RBD_MIN_OBJ_ORDER 16 #define RBD_MAX_OBJ_ORDER 30 -#define RBD_COMP_NONE 0 -#define RBD_CRYPT_NONE 0 - #define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" #define RBD_HEADER_SIGNATURE "RBD" #define RBD_HEADER_VERSION "001.005" diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7ce35aec8c76..f297a9e18642 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) nr_pages = i; if (nr_pages > 0) { len = nr_pages << PAGE_SHIFT; + osd_req_op_extent_update(req, 0, len); break; } goto out_pages; @@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1017,8 +1018,7 @@ new_request: &ci->i_layout, vino, offset, &len, 0, num_ops, CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, false); if (IS_ERR(req)) { @@ -1028,8 +1028,7 @@ new_request: min(num_ops, CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, true); BUG_ON(IS_ERR(req)); @@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); unlock_page(page); return -EIO; @@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, - CEPH_OSD_OP_CREATE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5bc5d37b1217..4e7421caf380 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, inode); if (fscache_cookie_enabled(ci->fscache)) { - dout("fscache_file_set_cookie %p %p enabing cache\n", + dout("fscache_file_set_cookie %p %p enabling cache\n", inode, filp); } } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94fd76d04683..cd966f276a8d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) /* * Return caps we have registered with the MDS(s) as 'wanted'. */ -int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; @@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (!__cap_is_valid(cap)) + if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; @@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, delayed = 1; } ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + if (want & ~cap->mds_wanted) { + /* user space may open/close single file frequently. + * This avoids droping mds_wanted immediately after + * requesting new mds_wanted. + */ + __cap_set_timeouts(mdsc, ci); + } cap->issued &= retain; /* drop bits we don't want */ if (cap->implemented & ~cap->issued) { @@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - ceph_sync_write_wait(inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) goto out; @@ -2477,23 +2482,22 @@ again: if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { int mds_wanted; - if (ACCESS_ONCE(mdsc->fsc->mount_state) == + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); *err = -EIO; ret = 1; goto out_unlock; } - mds_wanted = __ceph_caps_mds_wanted(ci); - if ((mds_wanted & need) != need) { + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~(mds_wanted & need)) { dout("get_cap_refs %p caps were dropped" " (session killed?)\n", inode); *err = -ESTALE; ret = 1; goto out_unlock; } - if ((mds_wanted & file_wanted) == - (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + if (!(file_wanted & ~mds_wanted)) ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } @@ -3404,6 +3408,7 @@ retry: tcap->implemented |= issued; if (cap == ci->i_auth_cap) ci->i_auth_cap = tcap; + if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); @@ -3417,9 +3422,18 @@ retry: } else if (tsession) { /* add placeholder for the export tagert */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } + __ceph_remove_cap(cap, false); goto out_unlock; } @@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode, } int ceph_encode_dentry_release(void **p, struct dentry *dentry, + struct inode *dir, int mds, int drop, int unless) { - struct inode *dir = d_inode(dentry->d_parent); + struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); int force = 0; @@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; + if (!dir) { + parent = dget(dentry->d_parent); + dir = d_inode(parent); + } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + dput(parent); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 39ff678e567f..f2ae393e2c31 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) seq_puts(s, "\t(unsafe)"); else seq_puts(s, "\t"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8ab1fdf0bd49..3e9ad501addf 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -371,7 +371,7 @@ more: /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); - req->r_direct_is_hash = true; + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { @@ -417,7 +417,7 @@ more: fi->frag = frag; fi->last_readdir = req; - if (req->r_did_prepopulate) { + if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { fi->readdir_cache_idx = req->r_readdir_cache_idx; if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ @@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.getattr.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); @@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); goto out; } - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_SHARED on source inode (mds will lock it) */ @@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = drop_caps_for_unlink(inode); @@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); req->r_old_dentry_dir = old_dir; - req->r_locked_dir = new_dir; + req->r_parent = new_dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; op = ceph_snap(dir) == CEPH_SNAPDIR ? - CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (!IS_ERR(req)) { req->r_dentry = dget(dentry); - req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2; + req->r_num_caps = 2; + req->r_parent = dir; mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 180bbef760f2..e8f11fa565c5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_inode = d_inode(child); ihold(d_inode(child)); req->r_ino2 = ceph_vino(d_inode(parent)); - req->r_locked_dir = d_inode(parent); + req->r_parent = d_inode(parent); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 045d30d26624..26cc95421cca 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_lock(&ci->i_ceph_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { - int mds_wanted = __ceph_caps_mds_wanted(ci); + int mds_wanted = __ceph_caps_mds_wanted(ci, true); int issued = __ceph_caps_issued(ci, NULL); dout("open %p fmode %d want %s issued %s using existing\n", @@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.open.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); @@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -794,89 +793,6 @@ out: kfree(aio_work); } -/* - * Write commit request unsafe callback, called to tell us when a - * request is unsafe (that is, in flight--has been handed to the - * messenger to send to its target osd). It is called again when - * we've received a response message indicating the request is - * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request - * is completed early (and unsuccessfully) due to a timeout or - * interrupt. - * - * This is used if we requested both an ACK and ONDISK commit reply - * from the OSD. - */ -static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) -{ - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - - dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, - unsafe ? "un" : ""); - if (unsafe) { - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - - complete_all(&req->r_completion); - } else { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } -} - -/* - * Wait on any unsafe replies for the given inode. First wait on the - * newest request, and make that the upper bound. Then, if there are - * more requests, keep waiting on the oldest as long as it is still older - * than the original request. - */ -void ceph_sync_write_wait(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_writes; - struct ceph_osd_request *req; - u64 last_tid; - - if (!S_ISREG(inode->i_mode)) - return; - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - /* set upper bound as _last_ entry in chain */ - - req = list_last_entry(head, struct ceph_osd_request, - r_unsafe_item); - last_tid = req->r_tid; - - do { - ceph_osdc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("sync_write_wait on tid %llu (until %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); - ceph_osdc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - /* - * from here on look at first entry in chain, since we - * only want to wait for anything older than last_tid - */ - if (list_empty(head)) - break; - req = list_first_entry(head, struct ceph_osd_request, - r_unsafe_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); -} - static ssize_t ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_snap_context *snapc, @@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK; + flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; @@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, goto out; } - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; req->r_inode = inode; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, @@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode, ceph_vino(inode), offset, length, 0, 1, op, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { ret = PTR_ERR(req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5e659d054b40..fd8f771f99b7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; - INIT_LIST_HEAD(&ci->i_unsafe_writes); INIT_LIST_HEAD(&ci->i_unsafe_dirops); INIT_LIST_HEAD(&ci->i_unsafe_iops); spin_lock_init(&ci->i_unsafe_lock); @@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode) return 1; } -void ceph_evict_inode(struct inode *inode) -{ - /* wait unsafe sync writes */ - ceph_sync_write_wait(inode); - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); -} - static inline blkcnt_t calc_inode_blocks(u64 size) { return (size + (1<<9) - 1) >> 9; @@ -1016,7 +1007,9 @@ out: static void update_dentry_lease(struct dentry *dentry, struct ceph_mds_reply_lease *lease, struct ceph_mds_session *session, - unsigned long from_time) + unsigned long from_time, + struct ceph_vino *tgt_vino, + struct ceph_vino *dir_vino) { struct ceph_dentry_info *di = ceph_dentry(dentry); long unsigned duration = le32_to_cpu(lease->duration_ms); @@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry, long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; struct inode *dir; + /* + * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that + * we expect a negative dentry. + */ + if (!tgt_vino && d_really_is_positive(dentry)) + return; + + if (tgt_vino && (d_really_is_negative(dentry) || + !ceph_ino_compare(d_inode(dentry), tgt_vino))) + return; + spin_lock(&dentry->d_lock); dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dentry, duration, ttl); - /* make lease_rdcache_gen match directory */ dir = d_inode(dentry->d_parent); + /* make sure parent matches dir_vino */ + if (!ceph_ino_compare(dir, dir_vino)) + goto out_unlock; + /* only track leases on regular dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) goto out_unlock; @@ -1108,61 +1115,27 @@ out: * * Called with snap_rwsem (read). */ -int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, - struct ceph_mds_session *session) +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) { + struct ceph_mds_session *session = req->r_session; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_vino vino; + struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, rinfo->head->is_dentry, rinfo->head->is_target); -#if 0 - /* - * Debugging hook: - * - * If we resend completed ops to a recovering mds, we get no - * trace. Since that is very rare, pretend this is the case - * to ensure the 'no trace' handlers in the callers behave. - * - * Fill in inodes unconditionally to avoid breaking cap - * invariants. - */ - if (rinfo->head->op & CEPH_MDS_OP_WRITE) { - pr_info("fill_trace faking empty trace on %lld %s\n", - req->r_tid, ceph_mds_op_name(rinfo->head->op)); - if (rinfo->head->is_dentry) { - rinfo->head->is_dentry = 0; - err = fill_inode(req->r_locked_dir, - &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1); - } - if (rinfo->head->is_target) { - rinfo->head->is_target = 0; - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - err = fill_inode(in, &rinfo->targeti, NULL, - session, req->r_request_started, - req->r_fmode); - iput(in); - } - } -#endif - if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) + if (rinfo->head->result == 0 && req->r_parent) ceph_invalidate_dir_request(req); return 0; } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; if (dir) { err = fill_inode(dir, NULL, @@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dname.name = rinfo->dname; dname.len = rinfo->dname_len; dname.hash = full_name_hash(parent, dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1206,8 +1179,8 @@ retry_lookup: } err = 0; } else if (d_really_is_positive(dn) && - (ceph_ino(d_inode(dn)) != vino.ino || - ceph_snap(d_inode(dn)) != vino.snap)) { + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); d_delete(dn); @@ -1221,10 +1194,10 @@ retry_lookup: } if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - in = ceph_get_inode(sb, vino); + in = ceph_get_inode(sb, tvino); if (IS_ERR(in)) { err = PTR_ERR(in); goto done; @@ -1233,8 +1206,8 @@ retry_lookup: err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, req->r_request_started, - (!req->r_aborted && rinfo->head->result == 0) ? - req->r_fmode : -1, + (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { pr_err("fill_inode badness %p %llx.%llx\n", @@ -1247,8 +1220,9 @@ retry_lookup: * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later */ - if (rinfo->head->is_dentry && !req->r_aborted && - req->r_locked_dir && + if (rinfo->head->is_dentry && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { @@ -1257,17 +1231,19 @@ retry_lookup: * mknod symlink mkdir : null -> new inode * unlink : linked -> null */ - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; struct dentry *dn = req->r_dentry; bool have_dir_cap, have_lease; BUG_ON(!dn); BUG_ON(!dir); BUG_ON(d_inode(dn->d_parent) != dir); - BUG_ON(ceph_ino(dir) != - le64_to_cpu(rinfo->diri.in->ino)); - BUG_ON(ceph_snap(dir) != - le64_to_cpu(rinfo->diri.in->snapid)); + + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + BUG_ON(ceph_ino(dir) != dvino.ino); + BUG_ON(ceph_snap(dir) != dvino.snap); /* do we have a lease on the whole dir? */ have_dir_cap = @@ -1319,12 +1295,13 @@ retry_lookup: ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); - } else { - if (have_lease && d_unhashed(dn)) + } else if (have_lease) { + if (d_unhashed(dn)) d_add(dn, NULL); update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started); + req->r_request_started, + NULL, &dvino); } goto done; } @@ -1347,15 +1324,19 @@ retry_lookup: have_lease = false; } - if (have_lease) + if (have_lease) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started); + req->r_request_started, + &tvino, &dvino); + } dout(" final dn %p\n", dn); - } else if (!req->r_aborted && - (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP)) { + } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct dentry *dn = req->r_dentry; - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); @@ -1370,6 +1351,26 @@ retry_lookup: goto done; } req->r_dentry = dn; /* may have spliced */ + } else if (rinfo->head->is_dentry) { + struct ceph_vino *ptvino = NULL; + + if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || + le32_to_cpu(rinfo->dlease->duration_ms)) { + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + ptvino = &tvino; + } + + update_dentry_lease(req->r_dentry, rinfo->dlease, + session, req->r_request_started, ptvino, + &dvino); + } else { + dout("%s: no dentry lease or dir cap\n", __func__); + } } done: dout("fill_trace done err=%d\n", err); @@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; - if (req->r_aborted) + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); if (rinfo->hash_order && req->r_path2) { @@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_vino vino; + struct ceph_vino tvino, dvino; dname.name = rde->name; dname.len = rde->name_len; dname.hash = full_name_hash(parent, dname.name, dname.len); - vino.ino = le64_to_cpu(rde->inode.in->ino); - vino.snap = le64_to_cpu(rde->inode.in->snapid); + tvino.ino = le64_to_cpu(rde->inode.in->ino); + tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, @@ -1559,8 +1560,8 @@ retry_lookup: goto out; } } else if (d_really_is_positive(dn) && - (ceph_ino(d_inode(dn)) != vino.ino || - ceph_snap(d_inode(dn)) != vino.snap)) { + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); d_delete(dn); @@ -1572,7 +1573,7 @@ retry_lookup: if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, vino); + in = ceph_get_inode(parent->d_sb, tvino); if (IS_ERR(in)) { dout("new_inode badness\n"); d_drop(dn); @@ -1617,8 +1618,9 @@ retry_lookup: ceph_dentry(dn)->offset = rde->offset; + dvino = ceph_vino(d_inode(parent)); update_dentry_lease(dn, rde->lease, req->r_session, - req->r_request_started); + req->r_request_started, &tvino, &dvino); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { ret = fill_readdir_cache(d_inode(parent), dn, @@ -1632,7 +1634,7 @@ next_item: } out: if (err == 0 && skipped == 0) { - req->r_did_prepopulate = true; + set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); @@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work) mutex_lock(&ci->i_truncate_mutex); - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 7d752d53353a..4c9c72f26eb9 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) l.stripe_count = ci->i_layout.stripe_count; l.object_size = ci->i_layout.object_size; l.data_pool = ci->i_layout.pool_id; - l.preferred_osd = (s32)-1; + l.preferred_osd = -1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ - nl.preferred_osd = le64_to_cpu(-1); + nl.preferred_osd = -1; err = __validate_layout(mdsc, &nl); if (err) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c9d2e553a6c4..c681762d76e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref) ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } - if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_parent) + ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); @@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + /* Never leave an unregistered request on an unsafe list! */ + list_del_init(&req->r_unsafe_item); + if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; @@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc, erase_request(&mdsc->request_tree, req); - if (req->r_unsafe_dir && req->r_got_unsafe) { + if (req->r_unsafe_dir && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); } - if (req->r_target_inode && req->r_got_unsafe) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_target_item); @@ -668,6 +673,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } /* + * Walk back up the dentry tree until we hit a dentry representing a + * non-snapshot inode. We do this using the rcu_read_lock (which must be held + * when calling this) to ensure that the objects won't disappear while we're + * working with them. Once we hit a candidate dentry, we attempt to take a + * reference to it, and return that as the result. + */ +static struct inode *get_nonsnap_parent(struct dentry *dentry) +{ + struct inode *inode = NULL; + + while (dentry && !IS_ROOT(dentry)) { + inode = d_inode_rcu(dentry); + if (!inode || ceph_snap(inode) == CEPH_NOSNAP) + break; + dentry = dentry->d_parent; + } + if (inode) + inode = igrab(inode); + return inode; +} + +/* * Choose mds to send request to next. If there is a hint set in the * request (e.g., due to a prior forward hint from the mds), use that. * Otherwise, consult frag tree and/or caps to identify the @@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -static struct dentry *get_nonsnap_parent(struct dentry *dentry) -{ - /* - * we don't need to worry about protecting the d_parent access - * here because we never renaming inside the snapped namespace - * except to resplice to another snapdir, and either the old or new - * result is a valid result. - */ - while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) - dentry = dentry->d_parent; - return dentry; -} - static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; - bool is_hash = req->r_direct_is_hash; + bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); /* * is there a specific mds we should try? ignore hint if we have @@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { inode = req->r_inode; + ihold(inode); } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ - struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = d_inode(parent); + struct dentry *parent; + struct inode *dir; + + rcu_read_lock(); + parent = req->r_dentry->d_parent; + dir = req->r_parent ? : d_inode_rcu(parent); - if (dir->i_sb != mdsc->fsc->sb) { - /* not this fs! */ + if (!dir || dir->i_sb != mdsc->fsc->sb) { + /* not this fs or parent went negative */ inode = d_inode(req->r_dentry); + if (inode) + ihold(inode); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = get_nonsnap_parent(parent); - inode = d_inode(dn); + inode = get_nonsnap_parent(parent); dout("__choose_mds using nonsnap parent %p\n", inode); } else { /* dentry target */ inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ - inode = dir; + inode = igrab(dir); hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; + } else { + ihold(inode); } } + rcu_read_unlock(); } dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, @@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) - return mds; + goto out; } /* since this file/dir wasn't known to be @@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) - return mds; + goto out; } } } @@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); + iput(inode); goto random; } mds = cap->session->s_mds; @@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); +out: + iput(inode); return mds; random: @@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - list_del_init(&req->r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); __unregister_request(mdsc, req); @@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; if (ci->i_wrbuffer_ref > 0 && - ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) invalidate = true; while (!list_empty(&ci->i_cap_flush_list)) { @@ -1775,18 +1800,23 @@ retry: return path; } -static int build_dentry_path(struct dentry *dentry, +static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, int *pfreepath) { char *path; - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { - *pino = ceph_ino(d_inode(dentry->d_parent)); + rcu_read_lock(); + if (!dir) + dir = d_inode_rcu(dentry->d_parent); + if (dir && ceph_snap(dir) == CEPH_NOSNAP) { + *pino = ceph_ino(dir); + rcu_read_unlock(); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; } + rcu_read_unlock(); path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); @@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode, * an explicit ino+path. */ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, - const char *rpath, u64 rino, - const char **ppath, int *pathlen, + struct inode *rdiri, const char *rpath, + u64 rino, const char **ppath, int *pathlen, u64 *ino, int *freepath) { int r = 0; @@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, + freepath); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, int ret; ret = set_request_path_attr(req->r_inode, req->r_dentry, - req->r_path1, req->r_ino1.ino, + req->r_parent, req->r_path1, req->r_ino1.ino, &path1, &pathlen1, &ino1, &freepath1); if (ret < 0) { msg = ERR_PTR(ret); @@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2); if (ret < 0) { @@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_inode_drop, req->r_inode_unless, 0); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, - mds, req->r_dentry_drop, req->r_dentry_unless); + req->r_parent, mds, req->r_dentry_drop, + req->r_dentry_unless); if (req->r_old_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_old_dentry, - mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + req->r_old_dentry_dir, mds, + req->r_old_dentry_drop, + req->r_old_dentry_unless); if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); - if (req->r_got_unsafe) { + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; /* * Replay. Do not regenerate message (and rebuild @@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead = msg->front.iov_base; rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; - if (req->r_locked_dir) + if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; rhead->ino = 0; - dout(" r_locked_dir = %p\n", req->r_locked_dir); + dout(" r_parent = %p\n", req->r_parent); return 0; } @@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = 0; - if (req->r_err || req->r_got_result) { - if (req->r_aborted) + if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) __unregister_request(mdsc, req); goto out; } @@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc, err = -EIO; goto finish; } - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("do_request forced umount\n"); err = -EIO; goto finish; } - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (mdsc->mdsmap_err) { err = mdsc->mdsmap_err; dout("do_request mdsmap err %d\n", err); @@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts > 0) continue; /* only new requests */ @@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, dout("do_request on %p\n", req); - /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_locked_dir) - ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_parent) + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); @@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ - if (req->r_got_result) { + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { dout("aborted request %lld with %d\n", req->r_tid, err); @@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, */ mutex_lock(&req->r_fill_mutex); req->r_err = err; - req->r_aborted = true; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); - if (req->r_locked_dir && + if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE)) ceph_invalidate_dir_request(req); } else { @@ -2323,7 +2358,7 @@ out: */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { - struct inode *inode = req->r_locked_dir; + struct inode *inode = req->r_parent; dout("invalidate_dir_request %p (complete, lease(s))\n", inode); @@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* dup? */ - if ((req->r_got_unsafe && !head->safe) || - (req->r_got_safe && head->safe)) { + if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || + (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } - if (req->r_got_safe) { + if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); @@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { - req->r_got_safe = true; + set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); - if (req->r_got_unsafe) { + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the * cleanup. No need to examine the response; the MDS @@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * useful we could do with a revised return value. */ dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) @@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out; } } else { - req->r_got_unsafe = true; + set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = @@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); current->journal_info = req; - err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) @@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (realm) ceph_put_snap_realm(mdsc, realm); - if (err == 0 && req->r_got_unsafe && req->r_target_inode) { + if (err == 0 && req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); @@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } out_err: mutex_lock(&mdsc->mutex); - if (!req->r_aborted) { + if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (err) { req->r_err = err; } else { req->r_reply = ceph_msg_get(msg); - req->r_got_result = true; + set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { dout("reply arrived after request %lld was aborted\n", tid); @@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, goto out; /* dup reply? */ } - if (req->r_aborted) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { @@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); - BUG_ON(req->r_got_result); + BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; @@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts == 0) continue; /* only old requests */ @@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; return atomic_read(&mdsc->num_sessions) <= skipped; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3c6f77b7bb02..ac0475a2daa7 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -202,9 +202,18 @@ struct ceph_mds_request { char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; - struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ +#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ +#define CEPH_MDS_R_ABORTED (2) /* call was aborted */ +#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ +#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ +#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ +#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ +#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ + unsigned long r_req_flags; + struct mutex r_fill_mutex; union ceph_mds_request_args r_args; @@ -216,7 +225,6 @@ struct ceph_mds_request { /* for choosing which mds to send this request to */ int r_direct_mode; u32 r_direct_hash; /* choose dir frag based on this dentry hash */ - bool r_direct_is_hash; /* true if r_direct_hash is valid */ /* data payload is used for xattr ops */ struct ceph_pagelist *r_pagelist; @@ -234,7 +242,6 @@ struct ceph_mds_request { struct ceph_mds_reply_info_parsed r_reply_info; struct page *r_locked_page; int r_err; - bool r_aborted; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -262,9 +269,7 @@ struct ceph_mds_request { ceph_mds_request_callback_t r_callback; ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe, r_got_result; - bool r_did_prepopulate; long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6bd20d707bfd..0ec8d0114e57 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = { .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, .drop_inode = ceph_drop_inode, - .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + if (fsc->mount_options->rsize > fsc->mount_options->rasize && + fsc->mount_options->rsize >= PAGE_SIZE) + fsc->backing_dev_info.io_pages = + (fsc->mount_options->rsize + PAGE_SIZE - 1) + >> PAGE_SHIFT; + else if (fsc->mount_options->rsize == 0) + fsc->backing_dev_info.io_pages = ULONG_MAX; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3373b61faefd..e9410bcf4113 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -45,8 +45,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT 0 /* max read size */ -#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -343,7 +343,6 @@ struct ceph_inode_info { u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ - struct list_head i_unsafe_writes; /* uncommitted sync writes */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; @@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) } /* what the mds thinks we want */ -extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); @@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode); -extern void ceph_evict_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec *ctime, struct timespec *mtime, struct timespec *atime); extern int ceph_fill_trace(struct super_block *sb, - struct ceph_mds_request *req, - struct ceph_mds_session *session); + struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); @@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + struct inode *dir, int mds, int drop, int unless); extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, @@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -extern void ceph_sync_write_wait(struct inode *inode); + /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03a6653d329a..2ea0c282f3dc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -22,7 +22,6 @@ struct ceph_osd_client; * completion callback for async writepages */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); -typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); #define CEPH_HOMELESS_OSD -1 @@ -170,15 +169,12 @@ struct ceph_osd_request { unsigned int r_num_ops; int r_result; - bool r_got_reply; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion; - struct completion r_done_completion; /* fsync waiter */ + struct completion r_completion; /* private to osd_client.c */ ceph_osdc_callback_t r_callback; - ceph_osdc_unsafe_callback_t r_unsafe_callback; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 9a9041784dcf..938656f70807 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) case CEPH_POOL_TYPE_EC: return false; default: - BUG_ON(1); + BUG(); } } @@ -82,13 +82,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest, void ceph_oloc_destroy(struct ceph_object_locator *oloc); /* - * Maximum supported by kernel client object name length - * - * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define CEPH_MAX_OID_NAME_LEN 100 - -/* * 51-char inline_name is long enough for all cephfs and all but one * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all @@ -173,8 +166,8 @@ struct ceph_osdmap { * the list of osds that store+replicate them. */ struct crush_map *crush; - struct mutex crush_scratch_mutex; - int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; + struct mutex crush_workspace_mutex; + void *crush_workspace; }; static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 5c0da61cb763..5d0018782d50 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -50,7 +50,7 @@ struct ceph_timespec { #define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_HYBRID 3 -#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ +#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */ /* * placement group. diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index be8f12b8f195..fbecbd089d75 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -135,13 +135,6 @@ struct crush_bucket { __u32 size; /* num items */ __s32 *items; - /* - * cached random permutation: used for uniform bucket and for - * the linear search fallback for the other bucket types. - */ - __u32 perm_x; /* @x for which *perm is defined */ - __u32 perm_n; /* num elements of *perm that are permuted/defined */ - __u32 *perm; }; struct crush_bucket_uniform { @@ -211,6 +204,21 @@ struct crush_map { * device fails. */ __u8 chooseleaf_stable; + /* + * This value is calculated after decode or construction by + * the builder. It is exposed here (rather than having a + * 'build CRUSH working space' function) so that callers can + * reserve a static buffer, allocate space on the stack, or + * otherwise avoid calling into the heap allocator if they + * want to. The size of the working space depends on the map, + * while the size of the scratch vector passed to the mapper + * depends on the size of the desired result set. + * + * Nothing stops the caller from allocating both in one swell + * foop and passing in two points, though. + */ + size_t working_size; + #ifndef __KERNEL__ /* * version 0 (original) of straw_calc has various flaws. version 1 @@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i) return ((i+1) << 1)-1; } +/* + * These data structures are private to the CRUSH implementation. They + * are exposed in this header file because builder needs their + * definitions to calculate the total working size. + * + * Moving this out of the crush map allow us to treat the CRUSH map as + * immutable within the mapper and removes the requirement for a CRUSH + * map lock. + */ +struct crush_work_bucket { + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; /* Permutation of the bucket's items */ +}; + +struct crush_work { + struct crush_work_bucket **work; /* Per-bucket working store */ +}; + #endif diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 5dfd5b1125d2..c95e19e1ff11 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weights, int weight_max, - int *scratch); + void *cwin); + +/* + * Returns the exact amount of workspace that will need to be used + * for a given combination of crush_map and result_max. The caller can + * then allocate this much on its own, either on the stack, in a + * per-thread long-lived buffer, or however it likes. + */ +static inline size_t crush_work_size(const struct crush_map *map, + int result_max) +{ + return map->working_size + result_max * 3 * sizeof(__u32); +} + +void crush_init_workspace(const struct crush_map *map, void *v); #endif diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 50f040fdb2a9..b9233b990399 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", __func__, lock_name, type, cookie, tag, desc, flags); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - lock_op_page, lock_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, lock_op_page, + lock_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(lock_op_page); @@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc, dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - unlock_op_page, unlock_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, unlock_op_page, + unlock_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(unlock_op_page); @@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, cookie, ENTITY_NAME(*locker)); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - break_op_page, break_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, break_op_page, + break_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(break_op_page); @@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc, int get_info_op_buf_size; int name_len = strlen(lock_name); struct page *get_info_op_page, *reply_page; - size_t reply_len; + size_t reply_len = PAGE_SIZE; void *p, *end; int ret; diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 80d7c3a97cb8..5bf94c04f645 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) { kfree(b->item_weights); kfree(b->sum_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b->node_weights); kfree(b); @@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) { kfree(b->straws); kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 130ab407c5ec..b5cd8c21bfdf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size return -1; } - /* * bucket choose methods * @@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size * Since this is expensive, we optimize for the r=0 case, which * captures the vast majority of calls. */ -static int bucket_perm_choose(struct crush_bucket *bucket, +static int bucket_perm_choose(const struct crush_bucket *bucket, + struct crush_work_bucket *work, int x, int r) { unsigned int pr = r % bucket->size; unsigned int i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { + if (work->perm_x != (__u32)x || work->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; + work->perm_x = x; /* optimize common r=0 case */ if (pr == 0) { s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ + work->perm[0] = s; + work->perm_n = 0xffff; /* magic value, see below */ goto out; } for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { + work->perm[i] = i; + work->perm_n = 0; + } else if (work->perm_n == 0xffff) { /* clean up after the r=0 case above */ for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; + work->perm[i] = i; + work->perm[work->perm[0]] = 0; + work->perm_n = 1; } /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) - dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned int p = bucket->perm_n; + for (i = 0; i < work->perm_n; i++) + dprintk(" perm_choose have %d: %d\n", i, work->perm[i]); + while (work->perm_n <= pr) { + unsigned int p = work->perm_n; /* no point in swapping the final entry */ if (p < bucket->size - 1) { i = crush_hash32_3(bucket->hash, x, bucket->id, p) % (bucket->size - p); if (i) { - unsigned int t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; + unsigned int t = work->perm[p + i]; + work->perm[p + i] = work->perm[p]; + work->perm[p] = t; } dprintk(" perm_choose swap %d with %d\n", p, p+i); } - bucket->perm_n++; + work->perm_n++; } for (i = 0; i < bucket->size; i++) - dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); + dprintk(" perm_choose %d: %d\n", i, work->perm[i]); - s = bucket->perm[pr]; + s = work->perm[pr]; out: dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, bucket->size, x, r, pr, s); @@ -132,14 +132,14 @@ out: } /* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) +static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, + struct crush_work_bucket *work, int x, int r) { - return bucket_perm_choose(&bucket->h, x, r); + return bucket_perm_choose(&bucket->h, work, x, r); } /* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, +static int bucket_list_choose(const struct crush_bucket_list *bucket, int x, int r) { int i; @@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, w *= bucket->sum_weights[i]; w = w >> 16; /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) + if (w < bucket->item_weights[i]) { return bucket->h.items[i]; + } } dprintk("bad list sums for bucket %d\n", bucket->h.id); @@ -192,7 +193,7 @@ static int terminal(int x) return x & 1; } -static int bucket_tree_choose(struct crush_bucket_tree *bucket, +static int bucket_tree_choose(const struct crush_bucket_tree *bucket, int x, int r) { int n; @@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, /* straw */ -static int bucket_straw_choose(struct crush_bucket_straw *bucket, +static int bucket_straw_choose(const struct crush_bucket_straw *bucket, int x, int r) { __u32 i; @@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin) * */ -static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, +static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, int x, int r) { unsigned int i, high = 0; @@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, high_draw = draw; } } + return bucket->h.items[high]; } -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +static int crush_bucket_choose(const struct crush_bucket *in, + struct crush_work_bucket *work, + int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); + return bucket_uniform_choose( + (const struct crush_bucket_uniform *)in, + work, x, r); case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, + return bucket_list_choose((const struct crush_bucket_list *)in, x, r); case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, + return bucket_tree_choose((const struct crush_bucket_tree *)in, x, r); case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); + return bucket_straw_choose( + (const struct crush_bucket_straw *)in, + x, r); case CRUSH_BUCKET_STRAW2: - return bucket_straw2_choose((struct crush_bucket_straw2 *)in, - x, r); + return bucket_straw2_choose( + (const struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } - /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map, * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, @@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map, int rep; unsigned int ftotal, flocal; int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int r; int i; int item = 0; @@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map, if (local_fallback_retries > 0 && flocal >= (in->size>>1) && flocal > local_fallback_retries) - item = bucket_perm_choose(in, x, r); + item = bucket_perm_choose( + in, work->work[-1-in->id], + x, r); else - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map, sub_r = r >> (vary_r-1); else sub_r = 0; - if (crush_choose_firstn(map, - map->buckets[-1-item], - weight, weight_max, - x, stable ? 1 : outpos+1, 0, - out2, outpos, count, - recurse_tries, 0, - local_retries, - local_fallback_retries, - 0, - vary_r, - stable, - NULL, - sub_r) <= outpos) + if (crush_choose_firstn( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, stable ? 1 : outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + stable, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map, } } - if (!reject) { + if (!reject && !collide) { /* out? */ if (itemtype == 0) reject = is_out(map, weight, weight_max, item, x); - else - reject = 0; } reject: @@ -600,7 +611,8 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int left, int numrep, int type, int *out, int outpos, @@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map, int *out2, int parent_r) { - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int endpos = outpos + left; int rep; unsigned int ftotal; @@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map, break; } - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { - crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - recurse_tries, 0, - 0, NULL, r); + crush_choose_indep( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map, #endif } + +/* + * This takes a chunk of memory and sets it up to be a shiny new + * working area for a CRUSH placement computation. It must be called + * on any newly allocated memory before passing it in to + * crush_do_rule. It may be used repeatedly after that, so long as the + * map has not changed. If the map /has/ changed, you must make sure + * the working size is no smaller than what was allocated and re-run + * crush_init_workspace. + * + * If you do retain the working space between calls to crush, make it + * thread-local. + */ +void crush_init_workspace(const struct crush_map *map, void *v) +{ + struct crush_work *w = v; + __s32 b; + + /* + * We work by moving through the available space and setting + * values and pointers as we go. + * + * It's a bit like Forth's use of the 'allot' word since we + * set the pointer first and then reserve the space for it to + * point to by incrementing the point. + */ + v += sizeof(struct crush_work *); + w->work = v; + v += map->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < map->max_buckets; ++b) { + if (!map->buckets[b]) + continue; + + w->work[b] = v; + switch (map->buckets[b]->alg) { + default: + v += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = v; + v += map->buckets[b]->size * sizeof(__u32); + } + BUG_ON(v - (void *)w != map->working_size); +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map, * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector - * @scratch: scratch vector for private use; must be >= 3 * result_max + * @cwin: pointer to at least crush_work_size() bytes of memory */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - int *scratch) + void *cwin) { int result_len; - int *a = scratch; - int *b = scratch + result_max; - int *c = scratch + result_max*2; + struct crush_work *cw = cwin; + int *a = cwin + map->working_size; + int *b = a + result_max; + int *c = b + result_max; + int *w = a; + int *o = b; int recurse_to_leaf; - int *w; int wsize = 0; - int *o; int osize; int *tmp; - struct crush_rule *rule; + const struct crush_rule *rule; __u32 step; int i, j; int numrep; @@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map, rule = map->rules[ruleno]; result_len = 0; - w = a; - o = b; for (step = 0; step < rule->len; step++) { int firstn = 0; - struct crush_rule_step *curstep = &rule->steps[step]; + const struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) { case CRUSH_RULE_TAKE: @@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, + cw, map->buckets[bno], weight, weight_max, x, numrep, @@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, + cw, map->buckets[bno], weight, weight_max, x, out_size, numrep, @@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map, break; } } + return result_len; } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 292e33bd916e..85747b7f91a9 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -3,6 +3,7 @@ #include <linux/err.h> #include <linux/scatterlist.h> +#include <linux/sched.h> #include <linux/slab.h> #include <crypto/aes.h> #include <crypto/skcipher.h> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f3378ba1a828..b65bbf9f45eb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req) kref_init(&req->r_kref); init_completion(&req->r_completion); - init_completion(&req->r_done_completion); RB_CLEAR_NODE(&req->r_node); RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_unsafe_item); @@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, BUG_ON(length > previous); op->extent.length = length; - op->indata_len -= previous - length; + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) + op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); @@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) bool need_send = false; bool promoted = false; - WARN_ON(req->r_tid || req->r_got_reply); + WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: @@ -1704,17 +1704,10 @@ promote: static void account_request(struct ceph_osd_request *req) { - unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); + WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); - if (req->r_flags & CEPH_OSD_FLAG_READ) { - WARN_ON(req->r_flags & mask); - req->r_flags |= CEPH_OSD_FLAG_ACK; - } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) - WARN_ON(!(req->r_flags & mask)); - else - WARN_ON(1); - - WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + req->r_flags |= CEPH_OSD_FLAG_ONDISK; atomic_inc(&req->r_osdc->num_requests); } @@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - if (req->r_callback) + if (req->r_callback) { + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + req->r_tid, req->r_callback, req->r_result); req->r_callback(req); - else - complete_all(&req->r_completion); + } } /* - * Note that this is open-coded in handle_reply(), which has to deal - * with ack vs commit, dup acks, etc. + * This is open-coded in handle_reply(). */ static void complete_request(struct ceph_osd_request *req, int err) { @@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err) req->r_result = err; finish_request(req); __complete_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req) cancel_map_check(req); finish_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req) mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); - WARN_ON(!__linger_registered(lreq)); linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; @@ -2785,31 +2777,8 @@ e_inval: } /* - * We are done with @req if - * - @m is a safe reply, or - * - @m is an unsafe reply and we didn't want a safe one - */ -static bool done_request(const struct ceph_osd_request *req, - const struct MOSDOpReply *m) -{ - return (m->result < 0 || - (m->flags & CEPH_OSD_FLAG_ONDISK) || - !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); -} - -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - * - * ->r_unsafe_callback is set? yes no - * - * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, - * any or needed/got safe) r_done_completion r_done_completion - * - * first reply is unsafe r_unsafe_cb(true) (nothing) - * - * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, - * r_done_completion r_done_completion + * Handle MOSDOpReply. Set ->r_result and call the callback if it is + * specified. */ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { @@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); u32 data_len = 0; - bool already_acked; int ret; int i; @@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) le32_to_cpu(msg->hdr.data_len), req->r_tid); goto fail_request; } - dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, - req, req->r_tid, req->r_got_reply, m.result, data_len); - - already_acked = req->r_got_reply; - if (!already_acked) { - req->r_result = m.result ?: data_len; - req->r_replay_version = m.replay_version; /* struct */ - req->r_got_reply = true; - } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { - dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_unlock_session; - } - - if (done_request(req, &m)) { - finish_request(req); - if (req->r_linger) { - WARN_ON(req->r_unsafe_callback); - dout("req %p tid %llu cb (locked)\n", req, req->r_tid); - __complete_request(req); - } - } + dout("%s req %p tid %llu result %d data_len %u\n", __func__, + req, req->r_tid, m.result, data_len); + /* + * Since we only ever request ONDISK, we should only ever get + * one (type of) reply back. + */ + WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_result = m.result ?: data_len; + finish_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); - if (done_request(req, &m)) { - if (already_acked && req->r_unsafe_callback) { - dout("req %p tid %llu safe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, false); - } else if (!req->r_linger) { - dout("req %p tid %llu cb\n", req, req->r_tid); - __complete_request(req); - } - complete_all(&req->r_done_completion); - ceph_osdc_put_request(req); - } else { - if (req->r_unsafe_callback) { - dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, true); - } else { - WARN_ON(1); - } - } - + __complete_request(req); + complete_all(&req->r_completion); + ceph_osdc_put_request(req); return; fail_request: @@ -3540,7 +3480,7 @@ again: up_read(&osdc->lock); dout("%s waiting on req %p tid %llu last_tid %llu\n", __func__, req, req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); + wait_for_completion(&req->r_completion); ceph_osdc_put_request(req); goto again; } @@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); - lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + lreq->t.flags = CEPH_OSD_FLAG_WRITE; lreq->mtime = CURRENT_TIME; lreq->reg_req = alloc_linger_request(lreq); @@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + req->r_flags = CEPH_OSD_FLAG_WRITE; req->r_mtime = CURRENT_TIME; osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); @@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map); * Execute an OSD class method on an object. * * @flags: CEPH_OSD_FLAG_* - * @resp_len: out param for reply length + * @resp_len: in/out param for reply length */ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_object_id *oid, @@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_osd_request *req; int ret; + if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE)) + return -E2BIG; + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; @@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, 0, false, false); if (resp_page) osd_req_op_cls_response_data_pages(req, 0, &resp_page, - PAGE_SIZE, 0, false, false); + *resp_len, 0, false, false); ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); @@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int page_align = off & ~PAGE_MASK; req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, true); if (IS_ERR(req)) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d2436880b305..6824c0ec8373 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -153,6 +153,32 @@ bad: return -EINVAL; } +static void crush_finalize(struct crush_map *c) +{ + __s32 b; + + /* Space for the array of pointers to per-bucket workspace */ + c->working_size = sizeof(struct crush_work) + + c->max_buckets * sizeof(struct crush_work_bucket *); + + for (b = 0; b < c->max_buckets; b++) { + if (!c->buckets[b]) + continue; + + switch (c->buckets[b]->alg) { + default: + /* + * The base case, permutation variables and + * the pointer to the permutation array. + */ + c->working_size += sizeof(struct crush_work_bucket); + break; + } + /* Every bucket has a permutation array. */ + c->working_size += c->buckets[b]->size * sizeof(__u32); + } +} + static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; @@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); if (b->items == NULL) goto badmem; - b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); - if (b->perm == NULL) - goto badmem; - b->perm_n = 0; ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) @@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); + crush_finalize(c); + done: dout("crush_decode success\n"); return c; @@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + mutex_init(&map->crush_workspace_mutex); return map; } @@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_weight); kfree(map->osd_addr); kfree(map->osd_primary_affinity); + kfree(map->crush_workspace); kfree(map); } @@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) +{ + void *workspace; + size_t work_size; + + if (IS_ERR(crush)) + return PTR_ERR(crush); + + work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); + dout("%s work_size %zu bytes\n", __func__, work_size); + workspace = kmalloc(work_size, GFP_NOIO); + if (!workspace) { + crush_destroy(crush); + return -ENOMEM; + } + crush_init_workspace(crush, workspace); + + if (map->crush) + crush_destroy(map->crush); + kfree(map->crush_workspace); + map->crush = crush; + map->crush_workspace = workspace; + return 0; +} + #define OSDMAP_WRAPPER_COMPAT_VER 7 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 @@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* crush */ ceph_decode_32_safe(p, end, len, e_inval); - map->crush = crush_decode(*p, min(*p + len, end)); - if (IS_ERR(map->crush)) { - err = PTR_ERR(map->crush); - map->crush = NULL; + err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); + if (err) goto bad; - } - *p += len; /* ignore the rest */ *p = end; @@ -1375,7 +1421,6 @@ e_inval: struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map) { - struct crush_map *newcrush = NULL; struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; @@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new crush? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) { - err = PTR_ERR(newcrush); - newcrush = NULL; + err = osdmap_set_crush(map, + crush_decode(*p, min(*p + len, end))); + if (err) goto bad; - } *p += len; } @@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->epoch++; map->modified = modified; - if (newcrush) { - if (map->crush) - crush_destroy(map->crush); - map->crush = newcrush; - newcrush = NULL; - } /* new_pools */ err = decode_new_pools(p, end, map); @@ -1505,8 +1542,6 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - if (newcrush) - crush_destroy(newcrush); return ERR_PTR(err); } @@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, BUG_ON(result_max > CEPH_PG_MAX_SIZE); - mutex_lock(&map->crush_scratch_mutex); + mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_scratch_ary); - mutex_unlock(&map->crush_scratch_mutex); + weight, weight_max, map->crush_workspace); + mutex_unlock(&map->crush_workspace_mutex); return r; } @@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, return; } - len = do_crush(osdmap, ruleno, pps, raw->osds, - min_t(int, pi->size, ARRAY_SIZE(raw->osds)), + if (pi->size > ARRAY_SIZE(raw->osds)) { + pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", + pi->id, pi->crush_ruleset, pi->type, pi->size, + ARRAY_SIZE(raw->osds)); + return; + } + + len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 154683f5f14c..705414e78ae0 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -18,8 +18,6 @@ * 02110-1301, USA. */ -#include <stddef.h> - #include <linux/types.h> #include <linux/export.h> #include <linux/ceph/libceph.h> |