Diffstat (limited to 'net')
43 files changed, 1135 insertions(+), 444 deletions(-)
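The net/ceph hunks that follow replace open-coded kmalloc-with-vmalloc-fallback logic with the new ceph_kvmalloc()/ceph_kvfree() helpers. A minimal standalone restatement of that pattern, mirroring the helper bodies added in net/ceph/ceph_common.c (illustration only, not part of the patch; kv_alloc/kv_free names are hypothetical):

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Try kmalloc() first for non-costly sizes; larger requests (which
 * are likely to fail under fragmentation) fall back to vmalloc().
 * Same logic as the ceph_kvmalloc() added below. */
static void *kv_alloc(size_t size, gfp_t flags)
{
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		void *ptr = kmalloc(size, flags | __GFP_NOWARN);
		if (ptr)
			return ptr;
	}
	return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
}

/* Free with whichever allocator handed the pointer out. */
static void kv_free(const void *ptr)
{
	if (is_vmalloc_addr(ptr))
		vfree(ptr);
	else
		kfree(ptr);
}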
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index bf3e6a13c215..621b5f65407f 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -6,6 +6,7 @@ #include <linux/ceph/buffer.h> #include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); - if (!b->vec.iov_base) { - kfree(b); - return NULL; - } - b->is_vmalloc = true; + b->vec.iov_base = ceph_kvmalloc(len, gfp); + if (!b->vec.iov_base) { + kfree(b); + return NULL; } kref_init(&b->kref); @@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref) struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); dout("buffer_release %p\n", b); - if (b->vec.iov_base) { - if (b->is_vmalloc) - vfree(b->vec.iov_base); - else - kfree(b->vec.iov_base); - } + ceph_kvfree(b->vec.iov_base); kfree(b); } EXPORT_SYMBOL(ceph_buffer_release); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 34b11ee8124e..67d7721d237e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> +#include <linux/vmalloc.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> @@ -170,6 +171,25 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +void *ceph_kvmalloc(size_t size, gfp_t flags) +{ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *ptr = kmalloc(size, flags | __GFP_NOWARN); + if (ptr) + return ptr; + } + + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); +} + +void ceph_kvfree(const void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int parse_fsid(const char *str, struct ceph_fsid *fsid) { @@ -461,8 +481,8 @@ EXPORT_SYMBOL(ceph_client_id); * create a fresh client instance */ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned int supported_features, - unsigned int required_features) + u64 supported_features, + u64 required_features) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 089613234f03..16bc199d9a62 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map) if (map->rules) { __u32 b; for (b = 0; b < map->max_rules; b++) - kfree(map->rules[b]); + crush_destroy_rule(map->rules[b]); kfree(map->rules); } kfree(map); } - +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index cbd06a91941c..b703790b4e44 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -189,7 +189,7 @@ static int terminal(int x) static int bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) { - int n, l; + int n; __u32 w; __u64 t; @@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, n = bucket->num_nodes >> 1; while (!terminal(n)) { + int l; /* pick point in [0, w) */ w = bucket->node_weights[n]; t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, @@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) * true 
if device is marked "out" (failed, fully offloaded) * of the cluster */ -static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, + const __u32 *weight, int weight_max, + int item, int x) { + if (item >= weight_max) + return 1; if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) @@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in } /** - * crush_choose - choose numrep distinct items of given type + * crush_choose_firstn - choose numrep distinct items of given type * @map: the crush_map * @bucket: the bucket we are choose an item from * @x: crush input value @@ -285,18 +290,24 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector - * @firstn: true if choosing "first n" items, false if choosing "indep" - * @recurse_to_leaf: true if we want one device under each item of given type - * @descend_once: true if we should only try one descent before giving up + * @tries: number of attempts to make + * @recurse_tries: number of attempts to have recursive chooseleaf make + * @local_tries: localized retries + * @local_fallback_tries: localized fallback retries + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @out2: second output vector for leaf items (if @recurse_to_leaf) */ -static int crush_choose(const struct crush_map *map, - struct crush_bucket *bucket, - const __u32 *weight, - int x, int numrep, int type, - int *out, int outpos, - int firstn, int recurse_to_leaf, - int descend_once, int *out2) +static int crush_choose_firstn(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + unsigned int local_tries, + unsigned int local_fallback_tries, + int recurse_to_leaf, + int *out2) { int rep; unsigned int ftotal, flocal; @@ -325,35 +336,17 @@ static int crush_choose(const struct crush_map *map, collide = 0; retry_bucket = 0; r = rep; - if (in->alg == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || (__u32)numrep >= in->size) - /* r' = r + f_total */ - r += ftotal; - else if (in->size % numrep == 0) - /* r'=r+(n+1)*f_local */ - r += (numrep+1) * - (flocal+ftotal); - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } else { - if (firstn) - /* r' = r + f_total */ - r += ftotal; - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } + /* r' = r + f_total */ + r += ftotal; /* bucket choose */ if (in->size == 0) { reject = 1; goto reject; } - if (map->choose_local_fallback_tries > 0 && + if (local_fallback_tries > 0 && flocal >= (in->size>>1) && - flocal > map->choose_local_fallback_tries) + flocal > local_fallback_tries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -394,13 +387,15 @@ static int crush_choose(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { - if (crush_choose(map, + if (crush_choose_firstn(map, map->buckets[-1-item], - weight, + weight, weight_max, x, outpos+1, 0, out2, outpos, - firstn, 0, - map->chooseleaf_descend_once, + recurse_tries, 0, + local_tries, + local_fallback_tries, + 0, NULL) <= outpos) /* didn't get leaf */ reject = 1; @@ -414,6 +409,7 @@ static int crush_choose(const 
struct crush_map *map, /* out? */ if (itemtype == 0) reject = is_out(map, weight, + weight_max, item, x); else reject = 0; @@ -424,17 +420,14 @@ reject: ftotal++; flocal++; - if (reject && descend_once) - /* let outer call try again */ - skip_rep = 1; - else if (collide && flocal <= map->choose_local_tries) + if (collide && flocal <= local_tries) /* retry locally a few times */ retry_bucket = 1; - else if (map->choose_local_fallback_tries > 0 && - flocal <= in->size + map->choose_local_fallback_tries) + else if (local_fallback_tries > 0 && + flocal <= in->size + local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= map->choose_total_tries) + else if (ftotal <= tries) /* then retry descent */ retry_descent = 1; else @@ -464,21 +457,179 @@ reject: /** + * crush_choose_indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int left, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + int recurse_to_leaf, + int *out2, + int parent_r) +{ + struct crush_bucket *in = bucket; + int endpos = outpos + left; + int rep; + unsigned int ftotal; + int r; + int i; + int item = 0; + int itemtype; + int collide; + + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); + + /* initially my result is undefined */ + for (rep = outpos; rep < endpos; rep++) { + out[rep] = CRUSH_ITEM_UNDEF; + if (out2) + out2[rep] = CRUSH_ITEM_UNDEF; + } + + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] != CRUSH_ITEM_UNDEF) + continue; + + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + for (;;) { + /* note: we base the choice on the position + * even in the nested call. that means that + * if the first layer chooses the same bucket + * in a different position, we will tend to + * choose a different item in that bucket. + * this will involve more devices in data + * movement and tend to distribute the load. + */ + r = rep + parent_r; + + /* be careful */ + if (in->alg == CRUSH_BUCKET_UNIFORM && + in->size % numrep == 0) + /* r'=r+(n+1)*f_total */ + r += (numrep+1) * ftotal; + else + /* r' = r + n*f_total */ + r += numrep * ftotal; + + /* bucket choose */ + if (in->size == 0) { + dprintk(" empty bucket\n"); + break; + } + + item = crush_bucket_choose(in, x, r); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = CRUSH_ITEM_NONE; + left--; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = + CRUSH_ITEM_NONE; + left--; + break; + } + in = map->buckets[-1-item]; + continue; + } + + /* collision? 
*/ + collide = 0; + for (i = outpos; i < endpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + if (collide) + break; + + if (recurse_to_leaf) { + if (item < 0) { + crush_choose_indep(map, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); + if (out2[rep] == CRUSH_ITEM_NONE) { + /* placed nothing; no leaf */ + break; + } + } else { + /* we already have a leaf! */ + out2[rep] = item; + } + } + + /* out? */ + if (itemtype == 0 && + is_out(map, weight, weight_max, item, x)) + break; + + /* yay! */ + out[rep] = item; + left--; + break; + } + } + } + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] == CRUSH_ITEM_UNDEF) { + out[rep] = CRUSH_ITEM_NONE; + } + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { + out2[rep] = CRUSH_ITEM_NONE; + } + } +} + +/** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map * @ruleno: the rule id * @x: hash input * @result: pointer to result vector * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @scratch: scratch vector for private use; must be >= 3 * result_max */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight) + const __u32 *weight, int weight_max, + int *scratch) { int result_len; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int c[CRUSH_MAX_SET]; + int *a = scratch; + int *b = scratch + result_max; + int *c = scratch + result_max*2; int recurse_to_leaf; int *w; int wsize = 0; @@ -489,8 +640,10 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int firstn; - const int descend_once = 0; + int choose_tries = map->choose_total_tries; + int choose_local_tries = map->choose_local_tries; + int choose_local_fallback_tries = map->choose_local_fallback_tries; + int choose_leaf_tries = 0; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -503,29 +656,49 @@ int crush_do_rule(const struct crush_map *map, o = b; for (step = 0; step < rule->len; step++) { + int firstn = 0; struct crush_rule_step *curstep = &rule->steps[step]; - firstn = 0; switch (curstep->op) { case CRUSH_RULE_TAKE: w[0] = curstep->arg1; wsize = 1; break; - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 > 0) + choose_local_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 > 0) + choose_local_fallback_tries = curstep->arg1; + break; + + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; /* fall through */ - case CRUSH_RULE_CHOOSE_LEAF_INDEP: + case CRUSH_RULE_CHOOSELEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: if (wsize == 0) break; recurse_to_leaf = curstep->op == - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || + CRUSH_RULE_CHOOSELEAF_FIRSTN || curstep->op == - CRUSH_RULE_CHOOSE_LEAF_INDEP; + CRUSH_RULE_CHOOSELEAF_INDEP; /* reset output */ osize = 0; @@ -543,22 +716,51 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, - x, numrep, - curstep->arg2, - o+osize, j, - firstn, - recurse_to_leaf, - descend_once, c+osize); + if (firstn) { + int recurse_tries; + if 
(choose_leaf_tries) + recurse_tries = + choose_leaf_tries; + else if (map->chooseleaf_descend_once) + recurse_tries = 1; + else + recurse_tries = choose_tries; + osize += crush_choose_firstn( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + recurse_tries, + choose_local_tries, + choose_local_fallback_tries, + recurse_to_leaf, + c+osize); + } else { + crush_choose_indep( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + choose_leaf_tries ? + choose_leaf_tries : 1, + recurse_to_leaf, + c+osize, + 0); + osize += numrep; + } } if (recurse_to_leaf) /* copy final _leaf_ values to output set */ memcpy(o, c, osize*sizeof(*o)); - /* swap t and w arrays */ + /* swap o and w arrays */ tmp = o; o = w; w = tmp; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 83661cdc0766..258a382e75ed 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -132,7 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); + seq_printf(s, "%.*s", req->r_base_oid.name_len, + req->r_base_oid.name); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4a5df7b1cc9f..30efc5c18622 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -15,6 +15,7 @@ #include <linux/dns_resolver.h> #include <net/tcp.h> +#include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> @@ -777,13 +778,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, bio = data->bio; BUG_ON(!bio); - BUG_ON(!bio->bi_vcnt); cursor->resid = min(length, data->bio_length); cursor->bio = bio; - cursor->vector_index = 0; - cursor->vector_offset = 0; - cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; + cursor->bvec_iter = bio->bi_iter; + cursor->last_piece = + cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, @@ -792,71 +792,67 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, { struct ceph_msg_data *data = cursor->data; struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); - bio_vec = &bio->bi_io_vec[index]; - BUG_ON(cursor->vector_offset >= bio_vec->bv_len); - *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + *page_offset = (size_t) bio_vec.bv_offset; BUG_ON(*page_offset >= PAGE_SIZE); if (cursor->last_piece) /* pagelist offset is always 0 */ *length = cursor->resid; else - *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + *length = (size_t) bio_vec.bv_len; BUG_ON(*length > cursor->resid); BUG_ON(*page_offset + *length > PAGE_SIZE); - return bio_vec->bv_page; + return bio_vec.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned 
int) bio->bi_vcnt); - bio_vec = &bio->bi_io_vec[index]; + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); /* Advance the cursor offset */ BUG_ON(cursor->resid < bytes); cursor->resid -= bytes; - cursor->vector_offset += bytes; - if (cursor->vector_offset < bio_vec->bv_len) + + bio_advance_iter(bio, &cursor->bvec_iter, bytes); + + if (bytes < bio_vec.bv_len) return false; /* more bytes to process in this segment */ - BUG_ON(cursor->vector_offset != bio_vec->bv_len); /* Move on to the next segment, and possibly the next bio */ - if (++index == (unsigned int) bio->bi_vcnt) { + if (!cursor->bvec_iter.bi_size) { bio = bio->bi_next; - index = 0; + cursor->bio = bio; + if (bio) + cursor->bvec_iter = bio->bi_iter; + else + memset(&cursor->bvec_iter, 0, + sizeof(cursor->bvec_iter)); } - cursor->bio = bio; - cursor->vector_index = index; - cursor->vector_offset = 0; if (!cursor->last_piece) { BUG_ON(!cursor->resid); BUG_ON(!bio); /* A short read is OK, so use <= rather than == */ - if (cursor->resid <= bio->bi_io_vec[index].bv_len) + if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) cursor->last_piece = true; } @@ -1865,7 +1861,9 @@ int ceph_parse_ips(const char *c, const char *end, port = (port * 10) + (*p - '0'); p++; } - if (port > 65535 || port == 0) + if (port == 0) + port = CEPH_MON_PORT; + else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; @@ -1945,7 +1943,8 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = ceph_sanitize_features( + le64_to_cpu(con->in_reply.features)); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -2853,8 +2852,8 @@ static void con_fault(struct ceph_connection *con) */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc) { msgr->supported_features = supported_features; @@ -3126,15 +3125,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->data); /* front */ - m->front_max = front_len; if (front_len) { - if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, flags, - PAGE_KERNEL); - m->front_is_vmalloc = true; - } else { - m->front.iov_base = kmalloc(front_len, flags); - } + m->front.iov_base = ceph_kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); @@ -3143,7 +3135,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, } else { m->front.iov_base = NULL; } - m->front.iov_len = front_len; + m->front_alloc_len = m->front.iov_len = front_len; dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -3256,10 +3248,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) void ceph_msg_kfree(struct ceph_msg *m) { dout("msg_kfree %p\n", m); - if (m->front_is_vmalloc) - vfree(m->front.iov_base); - else - kfree(m->front.iov_base); + ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } @@ -3301,8 +3290,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->data_length); + pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, + msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", 
DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 1fe25cd29d0e..2ac9ef35110b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc) /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); __send_prepared_auth_request(monc, ret); } else { dout("open_session mon%d already open\n", monc->cur_mon); @@ -196,7 +196,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) int num; p = msg->front.iov_base; - end = p + msg->front_max; + end = p + msg->front_alloc_len; num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; ceph_encode_32(&p, num); @@ -897,7 +897,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret < 0) { monc->client->auth_err = ret; wake_up_all(&monc->client->auth_wq); @@ -939,7 +939,7 @@ static int __validate_auth(struct ceph_mon_client *monc) return 0; ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret <= 0) return ret; /* either an error, or no need to authenticate */ __send_prepared_auth_request(monc, ret); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2b4b32aaa893..0676f2b199d6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ @@ -368,6 +368,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + req->r_base_oloc.pool = -1; + req->r_target_oloc.pool = -1; + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -761,11 +764,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (num_ops > 1) osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - req->r_file_layout = *layout; /* keep a copy */ + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", - vino.ino, objnum); - req->r_oid_len = strlen(req->r_oid); + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), + "%llx.%08llx", vino.ino, objnum); + req->r_base_oid.name_len = strlen(req->r_base_oid.name); return req; } @@ -1044,8 +1047,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) !ceph_con_opened(&osd->o_con)) { struct ceph_osd_request *req; - dout(" osd addr hasn't changed and connection never opened," - " letting msgr retry"); + dout("osd addr hasn't changed and connection never opened, " + "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ list_for_each_entry(req, &osd->o_requests, r_osd_item) req->r_stamp = jiffies; @@ -1232,6 +1235,61 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 
EXPORT_SYMBOL(ceph_osdc_set_request_linger); /* + * Returns whether a request should be blocked from being sent + * based on the current osdmap and osd_client settings. + * + * Caller should hold map_sem for read. + */ +static bool __req_should_be_paused(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || + (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + +/* + * Calculate mapping of a request to a PG. Takes tiering into account. + */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, + struct ceph_osd_request *req, + struct ceph_pg *pg_out) +{ + bool need_check_tiering; + + need_check_tiering = false; + if (req->r_target_oloc.pool == -1) { + req->r_target_oloc = req->r_base_oloc; /* struct */ + need_check_tiering = true; + } + if (req->r_target_oid.name_len == 0) { + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); + need_check_tiering = true; + } + + if (need_check_tiering && + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); + if (pi) { + if ((req->r_flags & CEPH_OSD_FLAG_READ) && + pi->read_tier >= 0) + req->r_target_oloc.pool = pi->read_tier; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + pi->write_tier >= 0) + req->r_target_oloc.pool = pi->write_tier; + } + /* !pi is caught in ceph_oloc_oid_to_pg() */ + } + + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, + &req->r_target_oid, pg_out); +} + +/* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is * no up osd, set r_osd to NULL. 
Move the request to the appropriate list @@ -1248,10 +1306,11 @@ static int __map_request(struct ceph_osd_client *osdc, int acting[CEPH_PG_MAX_SIZE]; int o = -1, num = 0; int err; + bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, - ceph_file_layout_pg_pool(req->r_file_layout)); + + err = __calc_request_pg(osdc->osdmap, req, &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; @@ -1264,12 +1323,18 @@ static int __map_request(struct ceph_osd_client *osdc, num = err; } + was_paused = req->r_paused; + req->r_paused = __req_should_be_paused(osdc, req); + if (was_paused && !req->r_paused) + force_resend = 1; + if ((!force_resend && req->r_osd && req->r_osd->o_osd == o && req->r_sent >= req->r_osd->o_incarnation && req->r_num_pg_osds == num && memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1)) + (req->r_osd == NULL && o == -1) || + req->r_paused) return 0; /* no change */ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", @@ -1331,7 +1396,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); @@ -1362,6 +1427,40 @@ static void __send_queued(struct ceph_osd_client *osdc) } /* + * Caller should hold map_sem for read and request_mutex. + */ +static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc; + + __register_request(osdc, req); + req->r_sent = 0; + req->r_got_reply = 0; + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } + return rc; + } + + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + + return 0; +} + +/* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. 
When this * happens, we ping all OSDs with requests who have timed out to @@ -1432,6 +1531,109 @@ static void handle_osds_timeout(struct work_struct *work) round_jiffies_relative(delay)); } +static int ceph_oloc_decode(void **p, void *end, + struct ceph_object_locator *oloc) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret = 0; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_v < 3) { + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + if (struct_cv > 6) { + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + oloc->pool = ceph_decode_64(p); + *p += 4; /* skip preferred */ + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::key is set\n"); + goto e_inval; + } + + if (struct_v >= 5) { + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::nspace is set\n"); + goto e_inval; + } + } + + if (struct_v >= 6) { + s64 hash = ceph_decode_64(p); + if (hash != -1) { + pr_warn("ceph_object_locator::hash is set\n"); + goto e_inval; + } + } + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + +static int ceph_redirect_decode(void **p, void *end, + struct ceph_request_redirect *redir) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_cv > 1) { + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + ret = ceph_oloc_decode(p, end, &redir->oloc); + if (ret) + goto out; + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_request_redirect::object_name is set\n"); + goto e_inval; + } + + len = ceph_decode_32(p); + *p += len; /* skip osd_instructions */ + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + static void complete_request(struct ceph_osd_request *req) { complete_all(&req->r_safe_completion); /* fsync waiter */ @@ -1446,6 +1648,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, { void *p, *end; struct ceph_osd_request *req; + struct ceph_request_redirect redir; u64 tid; int object_len; unsigned int numops; @@ -1484,6 +1687,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, osdmap_epoch = ceph_decode_32(&p); /* lookup */ + down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (req == NULL) { @@ -1525,10 +1729,40 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - already_completed = req->r_got_reply; + if (le16_to_cpu(msg->hdr.version) >= 6) { + p += 8 + 4; /* skip replay_version */ + p += 8; /* skip user_version */ - if (!req->r_got_reply) { + err = ceph_redirect_decode(&p, end, &redir); + if (err) + goto bad_put; + } else { + redir.oloc.pool = -1; + } + + if (redir.oloc.pool != -1) { + dout("redirect pool %lld\n", redir.oloc.pool); + + __unregister_request(osdc, req); + req->r_target_oloc = redir.oloc; /* struct */ + + /* + * Start redirect requests 
with nofail=true. If + * mapping fails, request will end up on the notarget + * list, waiting for the new osdmap (which can take + * a while), even though the original request mapped + * successfully. In the future we might want to follow + * original request's nofail setting here. + */ + err = __ceph_osdc_start_request(osdc, req, true); + BUG_ON(err); + + goto out_unlock; + } + + already_completed = req->r_got_reply; + if (!req->r_got_reply) { req->r_result = result; dout("handle_reply result %d bytes %d\n", req->r_result, bytes); @@ -1542,8 +1776,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { dout("handle_reply tid %llu dup ack\n", tid); - mutex_unlock(&osdc->request_mutex); - goto done; + goto out_unlock; } dout("handle_reply tid %llu flags %d\n", tid, flags); @@ -1558,6 +1791,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, __unregister_request(osdc, req); mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); if (!already_completed) { if (req->r_unsafe_callback && @@ -1575,15 +1809,27 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, complete_request(req); } -done: +out: dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; +out_unlock: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + goto out; bad_put: + req->r_result = -EIO; + __unregister_request(osdc, req); + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + complete_request(req); ceph_osdc_put_request(req); bad_mutex: mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); @@ -1613,14 +1859,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) * * Caller should hold map_sem for read. */ -static void kick_requests(struct ceph_osd_client *osdc, int force_resend) +static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, + bool force_resend_writes) { struct ceph_osd_request *req, *nreq; struct rb_node *p; int needmap = 0; int err; + bool force_resend_req; - dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); + dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", + force_resend_writes ? 
" (force resend writes)" : ""); mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); @@ -1645,7 +1894,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) continue; } - err = __map_request(osdc, req, force_resend); + force_resend_req = force_resend || + (force_resend_writes && + req->r_flags & CEPH_OSD_FLAG_WRITE); + err = __map_request(osdc, req, force_resend_req); if (err < 0) continue; /* error */ if (req->r_osd == NULL) { @@ -1665,7 +1917,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) r_linger_item) { dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - err = __map_request(osdc, req, force_resend); + err = __map_request(osdc, req, + force_resend || force_resend_writes); dout("__map_request returned %d\n", err); if (err == 0) continue; /* no change and no osd was specified */ @@ -1707,6 +1960,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) struct ceph_osdmap *newmap = NULL, *oldmap; int err; struct ceph_fsid fsid; + bool was_full; dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); p = msg->front.iov_base; @@ -1720,6 +1974,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) down_write(&osdc->map_sem); + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); dout(" %d inc maps\n", nr_maps); @@ -1744,7 +2000,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } - kick_requests(osdc, 0); + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, 0, was_full); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1787,7 +2046,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) skipped_map = 1; ceph_osdmap_destroy(oldmap); } - kick_requests(osdc, skipped_map); + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, skipped_map, was_full); } p += maplen; nr_maps--; @@ -1804,7 +2066,9 @@ done: * we find out when we are no longer full and stop returning * ENOSPC. 
*/ - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) ceph_monc_request_next_osdmap(&osdc->client->monc); mutex_lock(&osdc->request_mutex); @@ -2068,10 +2332,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + ceph_encode_32(&p, req->r_base_oid.name_len); + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + req->r_base_oid.name, req->r_base_oid.name_len); + p += req->r_base_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); @@ -2125,34 +2390,16 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail) { - int rc = 0; + int rc; down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_queued(osdc); - } - rc = 0; -out_unlock: + + rc = __ceph_osdc_start_request(osdc, req, nofail); + mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); + return rc; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -2278,9 +2525,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) err = -ENOMEM; osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); if (!osdc->notify_wq) - goto out_msgpool; + goto out_msgpool_reply; + return 0; +out_msgpool_reply: + ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: @@ -2454,7 +2704,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_msg *m; struct ceph_osd_request *req; - int front = le32_to_cpu(hdr->front_len); + int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; @@ -2474,12 +2724,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); - if (front > req->r_reply->front.iov_len) { + if (front_len > req->r_reply->front_alloc_len) { pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", - front, (int)req->r_reply->front.iov_len, + front_len, req->r_reply->front_alloc_len, (unsigned int)con->peer_name.type, le64_to_cpu(con->peer_name.num)); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, + false); if (!m) goto out; ceph_msg_put(req->r_reply); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index dbd9a4792427..aade4a5c1c07 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -464,6 +464,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) return NULL; } +struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) +{ + 
return __lookup_pg_pool(&map->pg_pools, id); +} + const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; @@ -514,8 +519,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); return -EINVAL; } - if (cv > 7) { - pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); + if (cv > 9) { + pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); return -EINVAL; } len = ceph_decode_32(p); @@ -543,12 +548,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += len; } - /* skip removed snaps */ + /* skip removed_snaps */ num = ceph_decode_32(p); *p += num * (8 + 8); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); + *p += 4; /* skip crash_replay_interval */ + + if (ev >= 7) + *p += 1; /* skip min_size */ + + if (ev >= 8) + *p += 8 + 8; /* skip quota_max_* */ + + if (ev >= 9) { + /* skip tiers */ + num = ceph_decode_32(p); + *p += num * 8; + + *p += 8; /* skip tier_of */ + *p += 1; /* skip cache_mode */ + + pi->read_tier = ceph_decode_64(p); + pi->write_tier = ceph_decode_64(p); + } else { + pi->read_tier = -1; + pi->write_tier = -1; + } /* ignore the rest */ @@ -1090,25 +1117,40 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account. */ -int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool) +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out) { - struct ceph_pg_pool_info *pool_info; + struct ceph_pg_pool_info *pi; - BUG_ON(!osdmap); - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); - if (!pool_info) + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + if (!pi) return -EIO; - pg->pool = pool; - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); + pg_out->pool = oloc->pool; + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + pg_out->pool, pg_out->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_ceph_pg); +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); + +static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) +{ + int scratch[result_max * 3]; + + return crush_do_rule(map, ruleno, x, result, result_max, + weight, weight_max, scratch); +} /* * Calculate raw osd vector for the given pgid. 
Return pointer to osd @@ -1163,9 +1205,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->size, *num), - osdmap->osd_weight); + r = crush_do_rule_ary(osdmap->crush, ruleno, pps, + osds, min_t(int, pool->size, *num), + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, diff --git a/net/compat.c b/net/compat.c index dd32e34c1e2c..f50161fb812e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -780,21 +780,16 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, if (flags & MSG_CMSG_COMPAT) return -EINVAL; - if (COMPAT_USE_64BIT_TIME) - return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, - (struct timespec *) timeout); - if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); - if (get_compat_timespec(&ktspec, timeout)) + if (compat_get_timespec(&ktspec, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && put_compat_timespec(&ktspec, timeout)) + if (datagrams > 0 && compat_put_timespec(&ktspec, timeout)) datagrams = -EFAULT; return datagrams; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8f519dbb358b..5976ef0846bd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -47,6 +47,8 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> @@ -2119,7 +2121,7 @@ EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); /** * skb_zerocopy - Zero copy skb to skb * @to: destination buffer - * @source: source buffer + * @from: source buffer * @len: number of bytes to copy from source buffer * @hlen: size of linear headroom in destination buffer * @@ -3916,3 +3918,26 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset_trace(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
+ */ +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int hdr_len; + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + hdr_len = tcp_hdrlen(skb); + else + hdr_len = sizeof(struct udphdr); + return hdr_len + shinfo->gso_size; +} +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/ieee802154/6lowpan_iphc.c b/net/ieee802154/6lowpan_iphc.c index 083f905bf109..860aa2d445ba 100644 --- a/net/ieee802154/6lowpan_iphc.c +++ b/net/ieee802154/6lowpan_iphc.c @@ -678,7 +678,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, hc06_ptr += 3; } else { /* compress nothing */ - memcpy(hc06_ptr, &hdr, 4); + memcpy(hc06_ptr, hdr, 4); /* replace the top byte with new ECN | DSCP format */ *hc06_ptr = tmp; hc06_ptr += 4; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e7a92fdb36f6..ec4f762efda5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -178,7 +178,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info, else itn = net_generic(net, ipgre_net_id); - iph = (const struct iphdr *)skb->data; + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 054a3e97d822..3d4da2c16b6a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,7 +314,7 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c0e3cb72ad70..bd28f386bd02 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> +#include <linux/err.h> #include <net/sock.h> #include <net/ip.h> @@ -930,7 +931,7 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, } rtnl_unlock(); - return PTR_RET(itn->fb_tunnel_dev); + return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 302d6fb1ff2b..51d54dc376f3 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index 2dae8a5df23f..94425e421213 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -43,7 +43,7 @@ int llc_mac_hdr_init(struct sk_buff *skb, rc = 0; break; default: - WARN(1, "device type not supported: %d\n", skb->dev->type); + break; } return rc; } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7f01f2aec7b5..3849fd07a321 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1021,8 +1021,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, IEEE80211_P2P_OPPPS_ENABLE_BIT; err = ieee80211_assign_beacon(sdata, ¶ms->beacon); - if (err < 0) + if (err < 0) { + ieee80211_vif_release_channel(sdata); return err; + } changed |= err; err = drv_start_ap(sdata->local, sdata); @@ -1032,6 +1034,7 @@ static int ieee80211_start_ap(struct 
wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(sdata->u.ap.beacon, NULL); + ieee80211_vif_release_channel(sdata); return err; } @@ -1093,8 +1096,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; - cancel_work_sync(&sdata->u.ap.request_smps_work); - /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); @@ -1106,6 +1107,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); + sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF; __sta_info_flush(sdata, true); ieee80211_free_keys(sdata, true); @@ -2662,6 +2664,24 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); + /* + * cookie is either the roc cookie (for normal roc) + * or the SKB (for mgmt TX) + */ + if (!txskb) { + /* local->mtx protects this */ + local->roc_cookie_counter++; + roc->cookie = local->roc_cookie_counter; + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(roc->cookie == 0)) { + roc->cookie = 1; + local->roc_cookie_counter++; + } + *cookie = roc->cookie; + } else { + *cookie = (unsigned long)txskb; + } + /* if there's one pending or we're scanning, queue this one */ if (!list_empty(&local->roc_list) || local->scanning || local->radar_detect_enabled) @@ -2784,24 +2804,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!queued) list_add_tail(&roc->list, &local->roc_list); - /* - * cookie is either the roc cookie (for normal roc) - * or the SKB (for mgmt TX) - */ - if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... 
more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } - *cookie = roc->cookie; - } else { - *cookie = (unsigned long)txskb; - } - return 0; } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index dc3c28002e3e..afbe2b203c3e 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -466,7 +466,9 @@ void ieee80211_request_smps_ap_work(struct work_struct *work) u.ap.request_smps_work); sdata_lock(sdata); - __ieee80211_request_smps_ap(sdata, sdata->u.ap.driver_smps_mode); + if (sdata_dereference(sdata->u.ap.beacon, sdata)) + __ieee80211_request_smps_ap(sdata, + sdata->u.ap.driver_smps_mode); sdata_unlock(sdata); } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9c84b75f3de8..4453e2725e40 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -684,12 +684,9 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; - int active_ibss; u16 capability; - active_ibss = ieee80211_sta_active_ibss(sdata); - - if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) { + if (!is_zero_ether_addr(ifibss->bssid)) { capability = WLAN_CAPABILITY_IBSS; if (ifibss->privacy) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9db71cf7a665..be198f42f1f7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -413,20 +413,24 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, sdata); + mutex_unlock(&local->iflist_mtx); + mutex_lock(&local->mtx); ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, IEEE80211_CHANCTX_EXCLUSIVE); mutex_unlock(&local->mtx); if (ret) { + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + synchronize_net(); drv_remove_interface(local, sdata); kfree(sdata); return ret; } - mutex_lock(&local->iflist_mtx); - rcu_assign_pointer(local->monitor_sdata, sdata); - mutex_unlock(&local->iflist_mtx); - return 0; } @@ -765,12 +769,19 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_roc_purge(local, sdata); - if (sdata->vif.type == NL80211_IFTYPE_STATION) + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: ieee80211_mgd_stop(sdata); - - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + break; + case NL80211_IFTYPE_ADHOC: ieee80211_ibss_stop(sdata); - + break; + case NL80211_IFTYPE_AP: + cancel_work_sync(&sdata->u.ap.request_smps_work); + break; + default: + break; + } /* * Remove all stations associated with this interface. 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 5476a69b45c9..722151fa5dce 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -874,7 +874,7 @@ static int ieee80211_fragment(struct ieee80211_tx_data *tx, } /* adjust first fragment's length */ - skb->len = hdrlen + per_fragm; + skb_trim(skb, hdrlen + per_fragm); return 0; } diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 4106ca95ec86..7bf5b5b9e8b9 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -381,6 +381,8 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, rxrpc_assign_connection_id(conn); rx->conn = conn; + } else { + spin_lock(&trans->client_lock); } /* we've got a connection with a free channel and we can now attach the diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 898492a8d61b..34b5490dde65 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,7 +180,8 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->ip_summed == CHECKSUM_UNNECESSARY || + skb->ip_summed == CHECKSUM_PARTIAL) { ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); } else { @@ -353,6 +354,10 @@ csum_copy_error: if (continue_call) rxrpc_put_call(continue_call); rxrpc_kill_skb(skb); + if (!(flags & MSG_PEEK)) { + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + } skb_kill_datagram(&rx->sk, skb, flags); rxrpc_put_call(call); return -EAGAIN; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index fbba5b0ec121..1cb413fead89 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -21,7 +21,6 @@ #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> -#include <net/tcp.h> /* Simple Token Bucket Filter. @@ -148,16 +147,10 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, * Return length of individual segments of a gso packet, * including all headers (MAC, IP, TCP/UDP) */ -static unsigned int skb_gso_seglen(const struct sk_buff *skb) +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - const struct skb_shared_info *shinfo = skb_shinfo(skb); - - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); - return hdr_len + shinfo->gso_size; + return hdr_len + skb_gso_transport_seglen(skb); } /* GSO packet is too big, segment it so that tbf can transmit @@ -202,7 +195,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) int ret; if (qdisc_pkt_len(skb) > q->max_size) { - if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size) + if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 42fdfc634e56..6c0513a7f992 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -532,14 +532,7 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) static void warn_gssd(void) { - static unsigned long ratelimit; - unsigned long now = jiffies; - - if (time_after(now, ratelimit)) { - printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" - "Please check user daemon is running.\n"); - ratelimit = now + 15*HZ; - } + dprintk("AUTH_GSS upcall failed. 
Please check user daemon is running.\n"); } static inline int @@ -600,7 +593,6 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; - unsigned long timeout; DEFINE_WAIT(wait); int err; @@ -608,17 +600,16 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) __func__, from_kuid(&init_user_ns, cred->cr_uid)); retry: err = 0; - /* Default timeout is 15s unless we know that gssd is not running */ - timeout = 15 * HZ; - if (!sn->gssd_running) - timeout = HZ >> 2; + /* if gssd is down, just skip upcalling altogether */ + if (!gssd_running(net)) { + warn_gssd(); + return -EACCES; + } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, - sn->pipe_version >= 0, timeout); + sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { - if (err == 0) - sn->gssd_running = 0; warn_gssd(); err = -EACCES; } diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 76e42e6be755..24589bd2a4b6 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -59,6 +59,7 @@ #include <linux/crypto.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> +#include <linux/lcm.h> #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -72,7 +73,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, u32 outbits, u8 *out) { - int a, b, c, lcm; + unsigned long ulcm; int byte, i, msbit; /* the code below is more readable if I make these bytes @@ -82,17 +83,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, outbits >>= 3; /* first compute lcm(n,k) */ - - a = outbits; - b = inbits; - - while (b != 0) { - c = b; - b = a%b; - a = c; - } - - lcm = outbits*inbits/a; + ulcm = lcm(inbits, outbits); /* now do the real work */ @@ -101,7 +92,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, /* this will end up cycling through k lcm(k,n)/k times, which is correct */ - for (i = lcm-1; i >= 0; i--) { + for (i = ulcm-1; i >= 0; i--) { /* compute the msbit in k which gets added into this byte */ msbit = ( /* first, start with the msbit in the first, diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 458f85e9b0ba..abbb7dcd1689 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -137,7 +137,6 @@ void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; - init_waitqueue_head(&sn->gssp_wq); } int set_gssp_clnt(struct net *net) @@ -154,7 +153,6 @@ int set_gssp_clnt(struct net *net) sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); - wake_up(&sn->gssp_wq); return ret; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 008cdade5aae..0f73f4507746 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1263,65 +1263,34 @@ out: return ret; } -DEFINE_SPINLOCK(use_gssp_lock); - -static bool use_gss_proxy(struct net *net) -{ - struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - - if (sn->use_gss_proxy != -1) - return sn->use_gss_proxy; - spin_lock(&use_gssp_lock); - /* - * If you wanted gss-proxy, you should have said so before - * starting to accept requests: - */ - sn->use_gss_proxy = 0; - spin_unlock(&use_gssp_lock); - return 0; -} - -#ifdef CONFIG_PROC_FS - +/* + * Try to set the sn->use_gss_proxy variable to a new value. 
We only allow + * it to be changed if it's currently undefined (-1). If it's any other value + * then return -EBUSY unless the type wouldn't have changed anyway. + */ static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - int ret = 0; + int ret; WARN_ON_ONCE(type != 0 && type != 1); - spin_lock(&use_gssp_lock); - if (sn->use_gss_proxy == -1 || sn->use_gss_proxy == type) - sn->use_gss_proxy = type; - else - ret = -EBUSY; - spin_unlock(&use_gssp_lock); - wake_up(&sn->gssp_wq); - return ret; -} - -static inline bool gssp_ready(struct sunrpc_net *sn) -{ - switch (sn->use_gss_proxy) { - case -1: - return false; - case 0: - return true; - case 1: - return sn->gssp_clnt; - } - WARN_ON_ONCE(1); - return false; + ret = cmpxchg(&sn->use_gss_proxy, -1, type); + if (ret != -1 && ret != type) + return -EBUSY; + return 0; } -static int wait_for_gss_proxy(struct net *net, struct file *file) +static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - if (file->f_flags & O_NONBLOCK && !gssp_ready(sn)) - return -EAGAIN; - return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn)); + /* If use_gss_proxy is still undefined, then try to disable it */ + if (sn->use_gss_proxy == -1) + set_gss_proxy(net, 0); + return sn->use_gss_proxy; } +#ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -1342,10 +1311,10 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, return res; if (i != 1) return -EINVAL; - res = set_gss_proxy(net, 1); + res = set_gssp_clnt(net); if (res) return res; - res = set_gssp_clnt(net); + res = set_gss_proxy(net, 1); if (res) return res; return count; @@ -1355,16 +1324,12 @@ static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = PDE_DATA(file_inode(file)); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; - int ret; - ret = wait_for_gss_proxy(net, file); - if (ret) - return ret; - - snprintf(tbuf, sizeof(tbuf), "%d\n", use_gss_proxy(net)); + snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; @@ -1626,8 +1591,7 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) BUG_ON(integ_len % 4); *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); - if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, - integ_len)) + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) BUG(); if (resbuf->tail[0].iov_base == NULL) { if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) @@ -1635,10 +1599,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len; resbuf->tail[0].iov_len = 0; - resv = &resbuf->tail[0]; - } else { - resv = &resbuf->tail[0]; } + resv = &resbuf->tail[0]; mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) goto out_err; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index e521d20e1970..ae333c1845bb 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1111,9 +1111,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen) *bp++ = 'x'; len -= 2; while (blen && len >= 2) { - unsigned char c = *buf++; - *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); - *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + bp = hex_byte_pack(bp, *buf++); len -= 2; blen--; } diff 
--git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f09b7db2c492..0edada973434 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1529,9 +1529,13 @@ call_refreshresult(struct rpc_task *task) task->tk_action = call_refresh; switch (status) { case 0: - if (rpcauth_uptodatecred(task)) + if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; - return; + return; + } + /* Use rate-limiting and a max number of retries if refresh + * had status 0 but failed to update the cred. + */ case -ETIMEDOUT: rpc_delay(task, 3*HZ); case -EAGAIN: @@ -1729,6 +1733,7 @@ call_bind_status(struct rpc_task *task) return; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: + case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: @@ -1799,7 +1804,9 @@ call_connect_status(struct rpc_task *task) return; case -ECONNREFUSED: case -ECONNRESET: + case -ECONNABORTED: case -ENETUNREACH: + case -EHOSTUNREACH: /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); if (RPC_IS_SOFTCONN(task)) @@ -1902,6 +1909,7 @@ call_transmit_status(struct rpc_task *task) break; } case -ECONNRESET: + case -ECONNABORTED: case -ENOTCONN: case -EPIPE: rpc_task_force_reencode(task); @@ -2011,8 +2019,9 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; - case -ECONNRESET: case -ECONNREFUSED: + case -ECONNRESET: + case -ECONNABORTED: rpc_force_rebind(clnt); rpc_delay(task, 3*HZ); case -EPIPE: diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 779742cfc1ff..df5826876535 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -14,6 +14,7 @@ struct sunrpc_net { struct cache_detail *rsi_cache; struct super_block *pipefs_sb; + struct rpc_pipe *gssd_dummy; struct mutex pipefs_sb_lock; struct list_head all_clients; @@ -26,14 +27,11 @@ struct sunrpc_net { unsigned int rpcb_is_af_local : 1; struct mutex gssp_lock; - wait_queue_head_t gssp_wq; struct rpc_clnt *gssp_clnt; int use_gss_proxy; int pipe_version; atomic_t pipe_users; struct proc_dir_entry *use_gssp_proc; - - unsigned int gssd_running; }; extern int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index bf04b30a788a..b18554898562 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -17,6 +17,7 @@ #include <linux/fsnotify.h> #include <linux/kernel.h> #include <linux/rcupdate.h> +#include <linux/utsname.h> #include <asm/ioctls.h> #include <linux/poll.h> @@ -38,7 +39,7 @@ #define NET_NAME(net) ((net == &init_net) ? 
" (init_net)" : "") static struct file_system_type rpc_pipe_fs_type; - +static const struct rpc_pipe_ops gssd_dummy_pipe_ops; static struct kmem_cache *rpc_inode_cachep __read_mostly; @@ -216,14 +217,11 @@ rpc_destroy_inode(struct inode *inode) static int rpc_pipe_open(struct inode *inode, struct file *filp) { - struct net *net = inode->i_sb->s_fs_info; - struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; int first_open; int res = -ENXIO; mutex_lock(&inode->i_mutex); - sn->gssd_running = 1; pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -1159,6 +1157,7 @@ enum { RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, + RPCAUTH_gssd, RPCAUTH_RootEOF }; @@ -1195,6 +1194,10 @@ static const struct rpc_filelist files[] = { .name = "nfsd", .mode = S_IFDIR | S_IRUGO | S_IXUGO, }, + [RPCAUTH_gssd] = { + .name = "gssd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, }; /* @@ -1208,13 +1211,24 @@ struct dentry *rpc_d_lookup_sb(const struct super_block *sb, } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); -void rpc_pipefs_init_net(struct net *net) +int rpc_pipefs_init_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0); + if (IS_ERR(sn->gssd_dummy)) + return PTR_ERR(sn->gssd_dummy); + mutex_init(&sn->pipefs_sb_lock); - sn->gssd_running = 1; sn->pipe_version = -1; + return 0; +} + +void rpc_pipefs_exit_net(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + rpc_destroy_pipe_data(sn->gssd_dummy); } /* @@ -1244,11 +1258,134 @@ void rpc_put_sb_net(const struct net *net) } EXPORT_SYMBOL_GPL(rpc_put_sb_net); +static const struct rpc_filelist gssd_dummy_clnt_dir[] = { + [0] = { + .name = "clntXX", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, +}; + +static ssize_t +dummy_downcall(struct file *filp, const char __user *src, size_t len) +{ + return -EINVAL; +} + +static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = dummy_downcall, +}; + +/* + * Here we present a bogus "info" file to keep rpc.gssd happy. We don't expect + * that it will ever use this info to handle an upcall, but rpc.gssd expects + * that this file will be there and have a certain format. + */ +static int +rpc_show_dummy_info(struct seq_file *m, void *v) +{ + seq_printf(m, "RPC server: %s\n", utsname()->nodename); + seq_printf(m, "service: foo (1) version 0\n"); + seq_printf(m, "address: 127.0.0.1\n"); + seq_printf(m, "protocol: tcp\n"); + seq_printf(m, "port: 0\n"); + return 0; +} + +static int +rpc_dummy_info_open(struct inode *inode, struct file *file) +{ + return single_open(file, rpc_show_dummy_info, NULL); +} + +static const struct file_operations rpc_dummy_info_operations = { + .owner = THIS_MODULE, + .open = rpc_dummy_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct rpc_filelist gssd_dummy_info_file[] = { + [0] = { + .name = "info", + .i_fop = &rpc_dummy_info_operations, + .mode = S_IFREG | S_IRUSR, + }, +}; + +/** + * rpc_gssd_dummy_populate - create a dummy gssd pipe + * @root: root of the rpc_pipefs filesystem + * @pipe_data: pipe data created when netns is initialized + * + * Create a dummy set of directories and a pipe that gssd can hold open to + * indicate that it is up and running. 
+ */ +static struct dentry * +rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) +{ + int ret = 0; + struct dentry *gssd_dentry; + struct dentry *clnt_dentry = NULL; + struct dentry *pipe_dentry = NULL; + struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, + strlen(files[RPCAUTH_gssd].name)); + + /* We should never get this far if "gssd" doesn't exist */ + gssd_dentry = d_hash_and_lookup(root, &q); + if (!gssd_dentry) + return ERR_PTR(-ENOENT); + + ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); + if (ret) { + pipe_dentry = ERR_PTR(ret); + goto out; + } + + q.name = gssd_dummy_clnt_dir[0].name; + q.len = strlen(gssd_dummy_clnt_dir[0].name); + clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); + if (!clnt_dentry) { + pipe_dentry = ERR_PTR(-ENOENT); + goto out; + } + + ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); + if (ret) { + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); + pipe_dentry = ERR_PTR(ret); + goto out; + } + + pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); + if (IS_ERR(pipe_dentry)) { + __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); + } +out: + dput(clnt_dentry); + dput(gssd_dentry); + return pipe_dentry; +} + +static void +rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) +{ + struct dentry *clnt_dir = pipe_dentry->d_parent; + struct dentry *gssd_dir = clnt_dir->d_parent; + + __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry); + __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); + __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); + dput(pipe_dentry); +} + static int rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root; + struct dentry *root, *gssd_dentry; struct net *net = data; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1266,6 +1403,13 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; + + gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); + if (IS_ERR(gssd_dentry)) { + __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); + return PTR_ERR(gssd_dentry); + } + dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); @@ -1280,6 +1424,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return 0; err_depopulate: + rpc_gssd_dummy_depopulate(gssd_dentry); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); @@ -1289,6 +1434,16 @@ err_depopulate: return err; } +bool +gssd_running(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + struct rpc_pipe *pipe = sn->gssd_dummy; + + return pipe->nreaders || pipe->nwriters; +} +EXPORT_SYMBOL_GPL(gssd_running); + static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 3d6498af9adc..cd30120de9e4 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -44,12 +44,17 @@ static __net_init int sunrpc_init_net(struct net *net) if (err) goto err_unixgid; - rpc_pipefs_init_net(net); + err = rpc_pipefs_init_net(net); + if (err) + goto err_pipefs; + INIT_LIST_HEAD(&sn->all_clients); spin_lock_init(&sn->rpc_client_lock); spin_lock_init(&sn->rpcb_clnt_lock); return 0; +err_pipefs: + 
unix_gid_cache_destroy(net); err_unixgid: ip_map_cache_destroy(net); err_ipmap: @@ -60,6 +65,7 @@ err_proc: static __net_exit void sunrpc_exit_net(struct net *net) { + rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e7fbe368b4a3..5de6801cd924 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -916,9 +916,6 @@ static int __svc_register(struct net *net, const char *progname, #endif } - if (error < 0) - printk(KERN_WARNING "svc: failed to register %sv%u RPC " - "service (errno %d).\n", progname, version, -error); return error; } @@ -937,6 +934,7 @@ int svc_register(const struct svc_serv *serv, struct net *net, const unsigned short port) { struct svc_program *progp; + struct svc_version *vers; unsigned int i; int error = 0; @@ -946,7 +944,8 @@ int svc_register(const struct svc_serv *serv, struct net *net, for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { - if (progp->pg_vers[i] == NULL) + vers = progp->pg_vers[i]; + if (vers == NULL) continue; dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", @@ -955,16 +954,26 @@ int svc_register(const struct svc_serv *serv, struct net *net, proto == IPPROTO_UDP? "udp" : "tcp", port, family, - progp->pg_vers[i]->vs_hidden? - " (but not telling portmap)" : ""); + vers->vs_hidden ? + " (but not telling portmap)" : ""); - if (progp->pg_vers[i]->vs_hidden) + if (vers->vs_hidden) continue; error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); - if (error < 0) + + if (vers->vs_rpcb_optnl) { + error = 0; + continue; + } + + if (error < 0) { + printk(KERN_WARNING "svc: failed to register " + "%sv%u RPC service (errno %d).\n", + progp->pg_name, i, -error); break; + } } } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1750048130a7..7d4df99f761f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -749,6 +749,11 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ECONNABORTED: + case -ENETUNREACH: + case -EHOSTUNREACH: case -EAGAIN: dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 75b045e1cd50..817a1e523969 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -257,6 +257,7 @@ struct sock_xprt { void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); + void (*old_error_report)(struct sock *); }; /* @@ -274,6 +275,11 @@ struct sock_xprt { */ #define TCP_RPC_REPLY (1UL << 6) +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) { return (struct sockaddr *) &xprt->addr; @@ -799,6 +805,7 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) transport->old_data_ready = sk->sk_data_ready; transport->old_state_change = sk->sk_state_change; transport->old_write_space = sk->sk_write_space; + transport->old_error_report = sk->sk_error_report; } static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) @@ -806,6 +813,34 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; sk->sk_write_space = 
transport->old_write_space; + sk->sk_error_report = transport->old_error_report; +} + +/** + * xs_error_report - callback to handle TCP socket state errors + * @sk: socket + * + * Note: we don't call sock_error() since there may be a rpc_task + * using the socket, and so we don't want to clear sk->sk_err. + */ +static void xs_error_report(struct sock *sk) +{ + struct rpc_xprt *xprt; + int err; + + read_lock_bh(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + + err = -sk->sk_err; + if (err == 0) + goto out; + dprintk("RPC: xs_error_report client %p, error=%d...\n", + xprt, -err); + trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt_wake_pending_tasks(xprt, err); + out: + read_unlock_bh(&sk->sk_callback_lock); } static void xs_reset_transport(struct sock_xprt *transport) @@ -885,11 +920,6 @@ static void xs_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { struct xdr_skb_reader desc = { @@ -1869,6 +1899,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; xprt_clear_connected(xprt); @@ -2146,6 +2177,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ @@ -2932,10 +2964,9 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) /* * Once we've associated a backchannel xprt with a connection, - * we want to keep it around as long as long as the connection - * lasts, in case we need to start using it for a backchannel - * again; this reference won't be dropped until bc_xprt is - * destroyed. + * we want to keep it around as long as the connection lasts, + * in case we need to start using it for a backchannel again; + * this reference won't be dropped until bc_xprt is destroyed. */ xprt_get(xprt); args->bc_xprt->xpt_bc_xprt = xprt; diff --git a/net/wireless/core.c b/net/wireless/core.c index b5ff39a6f6ed..76ae6a605abb 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -203,8 +203,11 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; - WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev && - !rdev->scan_req->notified); + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (WARN_ON(!rdev->scan_req->notified)) + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev, false); + } } static int cfg80211_rfkill_set_block(void *data, bool blocked) @@ -440,9 +443,6 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; - /* support for 5/10 MHz is broken due to nl80211 API mess - disable */ - wiphy->flags &= ~WIPHY_FLAG_SUPPORTS_5_10_MHZ; - /* * There are major locking problems in nl80211/mac80211 for CSA, * disable for all drivers until this has been reworked. 
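The xprtsock.c hunks above save the socket's original sk_error_report callback and install xs_error_report(), which deliberately avoids sock_error() so that sk->sk_err is left in place for the RPC task to consume. A stripped-down sketch of the same save/override/restore pattern, assuming a hypothetical my_transport container rather than the real struct sock_xprt:

#include <net/sock.h>

struct my_transport {
	struct sock *sk;
	void (*old_error_report)(struct sock *);
};

/* Replacement handler; note it only reads sk_err, never clears it. */
static void my_error_report(struct sock *sk)
{
	read_lock_bh(&sk->sk_callback_lock);
	if (sk->sk_err)
		pr_debug("socket error: %d\n", sk->sk_err);
	read_unlock_bh(&sk->sk_callback_lock);
}

static void my_save_and_install(struct my_transport *t, struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	t->sk = sk;
	t->old_error_report = sk->sk_error_report;
	sk->sk_error_report = my_error_report;
	write_unlock_bh(&sk->sk_callback_lock);
}

static void my_restore(struct my_transport *t)
{
	struct sock *sk = t->sk;

	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_error_report = t->old_error_report;
	write_unlock_bh(&sk->sk_callback_lock);
}

The read_lock_bh() in the handler mirrors xs_error_report(): the callback can fire from softirq context, and sk_callback_lock is what keeps the socket state stable while it runs.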
@@ -859,8 +859,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; case NETDEV_DOWN: cfg80211_update_iface_num(rdev, wdev->iftype, -1); - WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev && - !rdev->scan_req->notified); + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (WARN_ON(!rdev->scan_req->notified)) + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev, false); + } if (WARN_ON(rdev->sched_scan_req && rdev->sched_scan_req->dev == wdev->netdev)) { diff --git a/net/wireless/core.h b/net/wireless/core.h index 9895ab16c051..40683004d523 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -62,6 +62,7 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct sk_buff *scan_msg; struct cfg80211_sched_scan_request *sched_scan_req; unsigned long suspend_at; struct work_struct scan_done_wk; @@ -363,7 +364,8 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); void __cfg80211_scan_done(struct work_struct *wk); -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev); +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message); void __cfg80211_sched_scan_results(struct work_struct *wk); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, bool driver_initiated); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 179786494308..058aa0e1a462 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1740,9 +1740,10 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) * We can then retry with the larger buffer. */ if ((ret == -ENOBUFS || ret == -EMSGSIZE) && - !skb->len && + !skb->len && !state->split && cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 4096; + state->split_start = 0; rtnl_unlock(); return 1; } @@ -5274,7 +5275,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto unlock; } @@ -10116,40 +10117,31 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, NL80211_MCGRP_SCAN, GFP_KERNEL); } -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted) { struct sk_buff *msg; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return NULL; if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + aborted ? 
NL80211_CMD_SCAN_ABORTED : + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { nlmsg_free(msg); - return; + return NULL; } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, - NL80211_MCGRP_SCAN, GFP_KERNEL); + return msg; } -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { - struct sk_buff *msg; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_SCAN_ABORTED) < 0) { - nlmsg_free(msg); - return; - } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, NL80211_MCGRP_SCAN, GFP_KERNEL); } diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index cb0216e1a004..1e6df9630f42 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -8,10 +8,10 @@ void nl80211_exit(void); void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted); +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd); void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index b528e31da2cf..d1ed4aebbbb7 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -161,18 +161,25 @@ static void __cfg80211_bss_expire(struct cfg80211_registered_device *dev, dev->bss_generation++; } -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message) { struct cfg80211_scan_request *request; struct wireless_dev *wdev; + struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_RTNL(); - request = rdev->scan_req; + if (rdev->scan_msg) { + nl80211_send_scan_result(rdev, rdev->scan_msg); + rdev->scan_msg = NULL; + return; + } + request = rdev->scan_req; if (!request) return; @@ -186,18 +193,16 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (request->aborted) { - nl80211_send_scan_aborted(rdev, wdev); - } else { - if (request->flags & NL80211_SCAN_FLAG_FLUSH) { - /* flush entries from previous scans */ - spin_lock_bh(&rdev->bss_lock); - __cfg80211_bss_expire(rdev, request->scan_start); - spin_unlock_bh(&rdev->bss_lock); - } - nl80211_send_scan_done(rdev, wdev); + if (!request->aborted && + request->flags & NL80211_SCAN_FLAG_FLUSH) { + /* flush entries from previous scans */ + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, request->scan_start); + spin_unlock_bh(&rdev->bss_lock); } + msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->aborted) { memset(&wrqu, 0, sizeof(wrqu)); @@ -211,6 +216,11 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) rdev->scan_req = NULL; kfree(request); + + if (!send_message) + 
rdev->scan_msg = msg; + else + nl80211_send_scan_result(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) { @@ -221,7 +231,7 @@ scan_done_wk); rtnl_lock(); - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, true); rtnl_unlock(); } @@ -1079,7 +1089,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto out; } @@ -1481,7 +1491,7 @@ int cfg80211_wext_giwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index a63509118508..f04d4c32e96e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -67,7 +67,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) ASSERT_RDEV_LOCK(rdev); ASSERT_WDEV_LOCK(wdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel)
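The cfg80211 scan changes that close this set replace the send-immediately nl80211_send_scan_done()/nl80211_send_scan_aborted() pair with a build-then-send split: the scan-done message is built while the request is still alive, and is parked in rdev->scan_msg whenever the caller cannot send it yet; new scans are refused with -EBUSY while either scan_req or scan_msg is pending. A rough sketch of that deferral scheme under hypothetical names (my_dev, my_scan_done and the my_* helpers are stand-ins, not cfg80211 API):

#include <linux/types.h>

struct sk_buff;
struct my_request { int aborted; };

struct my_dev {
	struct my_request *req;		/* in-flight scan request, if any */
	struct sk_buff *pending_msg;	/* built but not yet sent */
};

/* Stubs standing in for nl80211_build_scan_msg() and friends. */
static struct sk_buff *my_build_done_msg(struct my_dev *dev) { return NULL; }
static void my_send(struct my_dev *dev, struct sk_buff *msg) { }
static void my_free_request(struct my_dev *dev) { dev->req = NULL; }

static void my_scan_done(struct my_dev *dev, bool send_message)
{
	struct sk_buff *msg;

	/* A parked message means an earlier call already did the work. */
	if (dev->pending_msg) {
		my_send(dev, dev->pending_msg);
		dev->pending_msg = NULL;
		return;
	}

	msg = my_build_done_msg(dev);	/* may be NULL on allocation failure */
	my_free_request(dev);		/* request gone, msg already built */

	if (send_message)
		my_send(dev, msg);	/* tolerates NULL, consumes msg */
	else
		dev->pending_msg = msg;
}

Building the message before the request is freed is the point of the exercise: the aborted flag and the owning wdev are only valid while the request exists, so deferring just the fully formed sk_buff sidesteps any use-after-free on the paths (NETDEV_DOWN, cfg80211_stop_p2p_device) that must complete the scan synchronously.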