diff options
author | David S. Miller <davem@davemloft.net> | 2019-05-27 00:08:05 +0300 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2019-05-27 00:08:05 +0300 |
commit | 8fb91c3550c4666d4c37b5494b1c68aa9c3284a5 (patch) | |
tree | 117978cd906b0d9088608e0c98700ca4c80246ff /net/ipv4 | |
parent | ddf6ddb057f22445837df4d01bd966995d4426f7 (diff) | |
parent | 3c8fc87820446ce5b948dc17648509340102b818 (diff) | |
download | linux-8fb91c3550c4666d4c37b5494b1c68aa9c3284a5.tar.xz |
Merge branch 'inet-frags-avoid-possible-races-at-netns-dismantle'
Eric Dumazet says:
====================
inet: frags: avoid possible races at netns dismantle
This patch series fixes a race happening on netns dismantle with
frag queues. While rhashtable_free_and_destroy() is running,
concurrent timers might run inet_frag_kill() and attempt
rhashtable_remove_fast() calls. This is not allowed by
rhashtable logic.
Since I do not want to add expensive synchronize_rcu() calls
in the netns dismantle path, I had to stop embedding netns_frags
structures inline and dynamically allocate them instead.
The first ten patches do this preparation, so that
the last patch clearly shows the fix.
As this patch series is not exactly trivial, I chose to
target 5.3. We will backport it once it has soaked a bit.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/inet_fragment.c | 98 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 67 | ||||
-rw-r--r-- | net/ipv4/proc.c | 4 |
3 files changed, 91 insertions, 78 deletions
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 737808e27f8b..6ca9523374da 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -124,34 +124,50 @@ void inet_frags_fini(struct inet_frags *f) } EXPORT_SYMBOL(inet_frags_fini); +/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; + int count; - /* If we can not cancel the timer, it means this frag_queue - * is already disappearing, we have nothing to do. - * Otherwise, we own a refcount until the end of this function. - */ - if (!del_timer(&fq->timer)) - return; + count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; - refcount_dec(&fq->refcnt); + count++; + } else if (fq->flags & INET_FRAG_HASH_DEAD) { + count++; } spin_unlock_bh(&fq->lock); - inet_frag_put(fq); + if (refcount_sub_and_test(count, &fq->refcnt)) + inet_frag_destroy(fq); +} + +static void fqdir_rwork_fn(struct work_struct *work) +{ + struct fqdir *fqdir = container_of(to_rcu_work(work), + struct fqdir, destroy_rwork); + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + kfree(fqdir); } -void inet_frags_exit_net(struct netns_frags *nf) +void fqdir_exit(struct fqdir *fqdir) { - nf->high_thresh = 0; /* prevent creation of new frags */ + fqdir->high_thresh = 0; /* prevent creation of new frags */ + + /* paired with READ_ONCE() in inet_frag_kill() : + * We want to prevent rhashtable_remove_fast() calls + */ + smp_store_release(&fqdir->dead, true); + + INIT_RCU_WORK(&fqdir->destroy_rwork, fqdir_rwork_fn); + queue_rcu_work(system_wq, &fqdir->destroy_rwork); - rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } -EXPORT_SYMBOL(inet_frags_exit_net); +EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -159,11 +175,21 @@ void inet_frag_kill(struct 
inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - struct netns_frags *nf = fq->net; + struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; - rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); - refcount_dec(&fq->refcnt); + rcu_read_lock(); + /* This READ_ONCE() is paired with smp_store_release() + * in inet_frags_exit_net(). + */ + if (!READ_ONCE(fqdir->dead)) { + rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, + fqdir->f->rhash_params); + refcount_dec(&fq->refcnt); + } else { + fq->flags |= INET_FRAG_HASH_DEAD; + } + rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); @@ -172,7 +198,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); - struct inet_frags *f = q->net->f; + struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); @@ -203,7 +229,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { - struct netns_frags *nf; + struct fqdir *fqdir; unsigned int sum, sum_truesize = 0; struct inet_frags *f; @@ -211,18 +237,18 @@ void inet_frag_destroy(struct inet_frag_queue *q) WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. 
*/ - nf = q->net; - f = nf->f; + fqdir = q->fqdir; + f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); - sub_frag_mem_limit(nf, sum); + sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { @@ -232,9 +258,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (!q) return NULL; - q->net = nf; + q->fqdir = fqdir; f->constructor(q, arg); - add_frag_mem_limit(nf, f->qsize); + add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); @@ -243,21 +269,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, return q; } -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { - struct inet_frags *f = nf->f; + struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; - q = inet_frag_alloc(nf, f, arg); + q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } - mod_timer(&q->timer, jiffies + nf->timeout); + mod_timer(&q->timer, jiffies + fqdir->timeout); - *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { q->flags |= INET_FRAG_COMPLETE; @@ -269,18 +295,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, } /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) +struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { struct inet_frag_queue *fq = NULL, *prev; - if 
(!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) + if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh) return NULL; rcu_read_lock(); - prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) - fq = inet_frag_create(nf, key, &prev); + fq = inet_frag_create(fqdir, key, &prev); if (prev && !IS_ERR(prev)) { fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) @@ -391,7 +417,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, delta += head->truesize; if (delta) - add_frag_mem_limit(q->net, delta); + add_frag_mem_limit(q->fqdir, delta); /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part @@ -413,7 +439,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(q->net, clone->truesize); + add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { @@ -466,7 +492,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, rbn = rbnext; } } - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); *nextp = NULL; skb_mark_not_on_list(head); @@ -494,7 +520,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) if (head == q->fragments_tail) q->fragments_tail = NULL; - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); return head; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cf2b0a6a3337..1ffaec056821 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = 
container_of(q, struct ipq, q); - struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, - frags); - struct net *net = container_of(ipv4, struct net, ipv4); + struct net *net = q->fqdir->net; const struct frag_v4_compare_key *key = a; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->net->max_dist ? + qp->peer = q->fqdir->max_dist ? inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -142,7 +140,7 @@ static void ip_expire(struct timer_list *t) int err; qp = container_of(frag, struct ipq, q); - net = container_of(qp->q.net, struct net, ipv4.frags); + net = qp->q.fqdir->net; rcu_read_lock(); spin_lock(&qp->q.lock); @@ -211,7 +209,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->ipv4.frags, &key); + q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; @@ -222,7 +220,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = qp->q.net->max_dist; + unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; @@ -236,12 +234,8 @@ static int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments_tail && (end - start) > max; - if (rc) { - struct net *net; - - net = container_of(qp->q.net, struct net, ipv4.frags); - __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - } + if (rc) + __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } @@ -250,13 +244,13 @@ static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); + sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; @@ -273,7 +267,7 @@ static int 
ip_frag_reinit(struct ipq *qp) /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; @@ -352,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(qp->q.net, skb->truesize); + add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; @@ -399,7 +393,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; @@ -544,30 +538,24 @@ static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", - .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", - .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", - .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", - .data = &init_net.ipv4.frags.max_dist, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -600,13 +588,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv4.frags.high_thresh; - table[0].extra1 = &net->ipv4.frags.low_thresh; - table[1].data = &net->ipv4.frags.low_thresh; 
- table[1].extra2 = &net->ipv4.frags.high_thresh; - table[2].data = &net->ipv4.frags.timeout; - table[3].data = &net->ipv4.frags.max_dist; } + table[0].data = &net->ipv4.fqdir->high_thresh; + table[0].extra1 = &net->ipv4.fqdir->low_thresh; + table[1].data = &net->ipv4.fqdir->low_thresh; + table[1].extra2 = &net->ipv4.fqdir->high_thresh; + table[2].data = &net->ipv4.fqdir->timeout; + table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl(net, "net/ipv4", table); if (!hdr) @@ -654,6 +642,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) { int res; + res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); + if (res < 0) + return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -668,31 +659,27 @@ static int __net_init ipv4_frags_init_net(struct net *net) * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 4 * 1024 * 1024; - net->ipv4.frags.low_thresh = 3 * 1024 * 1024; + net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; + net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. 
*/ - net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.fqdir->timeout = IP_FRAG_TIME; - net->ipv4.frags.max_dist = 64; - net->ipv4.frags.f = &ip4_frags; + net->ipv4.fqdir->max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res < 0) - return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index c3610b37bb4c..b613572c6616 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -72,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", - atomic_read(&net->ipv4.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv4.frags)); + atomic_read(&net->ipv4.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv4.fqdir)); return 0; } |