diff options
author | Florian Westphal <fw@strlen.de> | 2022-04-11 14:01:16 +0300 |
---|---|---|
committer | Pablo Neira Ayuso <pablo@netfilter.org> | 2022-05-13 19:51:28 +0300 |
commit | 2ed3bf188b33630cf9d93b996ebf001847a00b5a (patch) | |
tree | 20b1da28b9537729ec7e28a70bcb76c3e2f15650 /net/netfilter | |
parent | a997157e42e3119b13c644549a3d8381a1d825d6 (diff) | |
download | linux-2ed3bf188b33630cf9d93b996ebf001847a00b5a.tar.xz |
netfilter: ecache: use dedicated list for event redelivery
This disentangles event redelivery and the percpu dying list.
Because entries are now stored on a dedicated list, all
entries are in NFCT_ECACHE_DESTROY_FAIL state and all entries
still have confirmed bit set -- the reference count is at least 1.
The 'struct net' back-pointer can be removed as well.
The pcpu dying list will be removed eventually, it has no functionality.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/netfilter')
-rw-r--r-- | net/netfilter/nf_conntrack_core.c | 33 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_ecache.c | 117 |
2 files changed, 80 insertions, 70 deletions
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0164e5f522e8..ca1d1d105163 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -660,15 +660,12 @@ void nf_ct_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_ct_destroy); -static void nf_ct_delete_from_lists(struct nf_conn *ct) +static void __nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; - nf_ct_helper_destroy(ct); - - local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, @@ -681,12 +678,33 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); +} +static void nf_ct_delete_from_lists(struct nf_conn *ct) +{ + nf_ct_helper_destroy(ct); + local_bh_disable(); + + __nf_ct_delete_from_lists(ct); nf_ct_add_to_dying_list(ct); local_bh_enable(); } +static void nf_ct_add_to_ecache_list(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); + + spin_lock(&cnet->ecache.dying_lock); + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &cnet->ecache.dying_list); + spin_unlock(&cnet->ecache.dying_lock); +#else + nf_ct_add_to_dying_list(ct); +#endif +} + bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; @@ -709,7 +727,12 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) /* destroy event was not delivered. nf_ct_put will * be done by event cache worker on redelivery. */ - nf_ct_delete_from_lists(ct); + nf_ct_helper_destroy(ct); + local_bh_disable(); + __nf_ct_delete_from_lists(ct); + nf_ct_add_to_ecache_list(ct); + local_bh_enable(); + nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 0cb2da0a759a..2752859479b2 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -16,7 +16,6 @@ #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> -#include <linux/percpu.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> @@ -29,8 +28,9 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); -#define ECACHE_RETRY_WAIT (HZ/10) -#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) +#define DYING_NULLS_VAL ((1 << 30) + 1) +#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) +#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, @@ -38,58 +38,58 @@ enum retry_state { STATE_DONE, }; -static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) +static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) { - struct nf_conn *refs[ECACHE_STACK_ALLOC]; + unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; + struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - unsigned int evicted = 0; + unsigned int sent; - spin_lock(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); - hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { +next: + sent = 0; + spin_lock_bh(&cnet->ecache.dying_lock); + + hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - struct nf_conntrack_ecache *e; - - if (!nf_ct_is_confirmed(ct)) - continue; - - /* This ecache access is safe because the ct is on the - * pcpu dying list and we hold the spinlock -- the entry - * cannot be free'd until after the lock is released. - * - * This is true even if ct has a refcount of 0: the - * cpu that is about to free the entry must remove it - * from the dying list and needs the lock to do so. - */ - e = nf_ct_ecache_find(ct); - if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) - continue; - /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means - * the worker owns this entry: the ct will remain valid - * until the worker puts its ct reference. + /* The worker owns all entries, ct remains valid until nf_ct_put + * in the loop below. */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } - e->state = NFCT_ECACHE_DESTROY_SENT; - refs[evicted] = ct; + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); - if (++evicted >= ARRAY_SIZE(refs)) { + if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } + + if (sent++ > 16) { + spin_unlock_bh(&cnet->ecache.dying_lock); + cond_resched(); + goto next; + } } - spin_unlock(&pcpu->lock); + spin_unlock_bh(&cnet->ecache.dying_lock); - /* can't _put while holding lock */ - while (evicted) - nf_ct_put(refs[--evicted]); + hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + hlist_nulls_add_fake(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); + nf_ct_put(ct); + + cond_resched(); + } return ret; } @@ -97,35 +97,20 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) static void ecache_work(struct work_struct *work) { struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); - struct netns_ct *ctnet = cnet->ecache.ct_net; - int cpu, delay = -1; - struct ct_pcpu *pcpu; - - local_bh_disable(); - - for_each_possible_cpu(cpu) { - enum retry_state ret; - - pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); - - ret = ecache_work_evict_list(pcpu); - - switch (ret) { - case STATE_CONGESTED: - delay = ECACHE_RETRY_WAIT; - goto out; - case STATE_RESTART: - delay = 0; - break; - case STATE_DONE: - break; - } + int ret, delay = -1; + + ret = ecache_work_evict_list(cnet); + switch (ret) { + case STATE_CONGESTED: + delay = ECACHE_RETRY_JIFFIES; + break; + case STATE_RESTART: + delay = 0; + break; + case STATE_DONE: + break; } - out: - local_bh_enable(); - - ctnet->ecache_dwork_pending = delay > 0; if (delay >= 0) schedule_delayed_work(&cnet->ecache.dwork, delay); } @@ -199,7 +184,6 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, */ if (e->portid == 0 && portid != 0) e->portid = portid; - e->state = NFCT_ECACHE_DESTROY_FAIL; } return ret; @@ -297,8 +281,10 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) schedule_delayed_work(&cnet->ecache.dwork, HZ); net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { - net->ct.ecache_dwork_pending = false; - mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + if (!hlist_nulls_empty(&cnet->ecache.dying_list)) + mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + else + net->ct.ecache_dwork_pending = false; } } @@ -311,8 +297,9 @@ void nf_conntrack_ecache_pernet_init(struct net *net) net->ct.sysctl_events = nf_ct_events; - cnet->ecache.ct_net = &net->ct; INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); + INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); + spin_lock_init(&cnet->ecache.dying_lock); BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } |