diff options
Diffstat (limited to 'net/sched')
33 files changed, 1540 insertions, 551 deletions
diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d4b8355737d8..aecf1bf233c8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -543,7 +543,7 @@ int tcf_register_action(struct tc_action_ops *act, write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { - if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { + if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); unregister_pernet_subsys(ops); return -EEXIST; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c7633843e223..aa5c38d11a30 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -396,7 +396,7 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", - .type = TCA_ACT_BPF, + .id = TCA_ID_BPF, .owner = THIS_MODULE, .act = tcf_bpf_act, .dump = tcf_bpf_dump, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 8475913f2070..5d24993cccfe 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -204,7 +204,7 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_connmark_ops = { .kind = "connmark", - .type = TCA_ACT_CONNMARK, + .id = TCA_ID_CONNMARK, .owner = THIS_MODULE, .act = tcf_connmark_act, .dump = tcf_connmark_dump, diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3dc25b7806d7..c79aca29505e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -559,8 +559,11 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); + bool orig_vlan_tag_present = false; + unsigned int vlan_hdr_count = 0; struct tcf_csum_params *params; u32 update_flags; + __be16 protocol; int action; params = rcu_dereference_bh(p->params); @@ -573,7 +576,9 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, goto drop; update_flags = params->update_flags; - switch (tc_skb_protocol(skb)) { + protocol = tc_skb_protocol(skb); +again: + switch (protocol) { case cpu_to_be16(ETH_P_IP): if (!tcf_csum_ipv4(skb, update_flags)) goto drop; @@ -582,13 +587,35 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, if (!tcf_csum_ipv6(skb, update_flags)) goto drop; break; + case cpu_to_be16(ETH_P_8021AD): /* fall through */ + case cpu_to_be16(ETH_P_8021Q): + if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) { + protocol = skb->protocol; + orig_vlan_tag_present = true; + } else { + struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data; + + protocol = vlan->h_vlan_encapsulated_proto; + skb_pull(skb, VLAN_HLEN); + skb_reset_network_header(skb); + vlan_hdr_count++; + } + goto again; + } + +out: + /* Restore the skb for the pulled VLAN tags */ + while (vlan_hdr_count--) { + skb_push(skb, VLAN_HLEN); + skb_reset_network_header(skb); } return action; drop: qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); - return TC_ACT_SHOT; + action = TC_ACT_SHOT; + goto out; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, @@ -660,7 +687,7 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) static struct tc_action_ops act_csum_ops = { .kind = "csum", - .type = TCA_ACT_CSUM, + .id = TCA_ID_CSUM, .owner = THIS_MODULE, .act = tcf_csum_act, .dump = tcf_csum_dump, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b61c20ebb314..93da0004e9f4 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -253,7 +253,7 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) static struct tc_action_ops act_gact_ops = { .kind = "gact", - .type = TCA_ACT_GACT, + .id = TCA_ID_GACT, .owner = THIS_MODULE, .act = tcf_gact_act, .stats_update = tcf_gact_stats_update, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 30b63fa23ee2..9b1f2b3990ee 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -864,7 +864,7 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_ife_ops = { .kind = "ife", - .type = TCA_ACT_IFE, + .id = TCA_ID_IFE, .owner = THIS_MODULE, .act = tcf_ife_act, .dump = tcf_ife_dump, diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index faa1addf89b3..98f5b6ea77b4 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -337,7 +337,7 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_ipt_ops = { .kind = "ipt", - .type = TCA_ACT_IPT, + .id = TCA_ID_IPT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, @@ -386,7 +386,7 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_xt_ops = { .kind = "xt", - .type = TCA_ACT_XT, + .id = TCA_ID_XT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index c8cf4d10c435..6692fd054617 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -400,7 +400,7 @@ static void tcf_mirred_put_dev(struct net_device *dev) static struct tc_action_ops act_mirred_ops = { .kind = "mirred", - .type = TCA_ACT_MIRRED, + .id = TCA_ID_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred_act, .stats_update = tcf_stats_update, diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c5c1e23add77..543eab9193f1 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -304,7 +304,7 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_nat_ops = { .kind = "nat", - .type = TCA_ACT_NAT, + .id = TCA_ID_NAT, .owner = THIS_MODULE, .act = tcf_nat_act, .dump = tcf_nat_dump, diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 2b372a06b432..a80373878df7 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -406,7 +406,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_t t; int s; - s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key); + s = struct_size(opt, keys, p->tcfp_nkeys); /* netlink spinlocks held above us - must use ATOMIC */ opt = kzalloc(s, GFP_ATOMIC); @@ -470,7 +470,7 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_pedit_ops = { .kind = "pedit", - .type = TCA_ACT_PEDIT, + .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .dump = tcf_pedit_dump, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ec8ec55e0fe8..8271a6263824 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -366,7 +366,7 @@ MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", - .type = TCA_ID_POLICE, + .id = TCA_ID_POLICE, .owner = THIS_MODULE, .act = tcf_police_act, .dump = tcf_police_dump, diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 1a0c682fd734..203e399e5c85 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -233,7 +233,7 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_sample_ops = { .kind = "sample", - .type = TCA_ACT_SAMPLE, + .id = TCA_ID_SAMPLE, .owner = THIS_MODULE, .act = tcf_sample_act, .dump = tcf_sample_dump, diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 902957beceb3..d54cb608dbaf 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -19,8 +19,6 @@ #include <net/netlink.h> #include <net/pkt_sched.h> -#define TCA_ACT_SIMP 22 - #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> @@ -197,7 +195,7 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_simp_ops = { .kind = "simple", - .type = TCA_ACT_SIMP, + .id = TCA_ID_SIMP, .owner = THIS_MODULE, .act = tcf_simp_act, .dump = tcf_simp_dump, diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cfceed28c333..65879500b688 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -304,7 +304,7 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", - .type = TCA_ACT_SKBEDIT, + .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, .dump = tcf_skbedit_dump, diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 59710a183bd3..7bac1d78e7a3 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -260,7 +260,7 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", - .type = TCA_ACT_SKBMOD, + .id = TCA_ACT_SKBMOD, .owner = THIS_MODULE, .act = tcf_skbmod_act, .dump = tcf_skbmod_dump, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 3f943de9a2c9..3beb4717d3b7 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,8 +201,14 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) { if (!p) return; - if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { +#ifdef CONFIG_DST_CACHE + struct ip_tunnel_info *info = &p->tcft_enc_metadata->u.tun_info; + + dst_cache_destroy(&info->dst_cache); +#endif dst_release(&p->tcft_enc_metadata->dst); + } kfree_rcu(p, rcu); } @@ -321,12 +327,18 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, goto err_out; } +#ifdef CONFIG_DST_CACHE + ret = dst_cache_init(&metadata->u.tun_info.dst_cache, GFP_KERNEL); + if (ret) + goto release_tun_meta; +#endif + if (opts_len) { ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], &metadata->u.tun_info, opts_len, extack); if (ret < 0) - goto release_tun_meta; + goto release_dst_cache; } metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; @@ -342,14 +354,14 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, &act_tunnel_key_ops, bind, true); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); - goto release_tun_meta; + goto release_dst_cache; } ret = ACT_P_CREATED; } else if (!ovr) { NL_SET_ERR_MSG(extack, "TC IDR already exists"); ret = -EEXIST; - goto release_tun_meta; + goto release_dst_cache; } t = to_tunnel_key(*a); @@ -359,7 +371,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); ret = -ENOMEM; exists = true; - goto release_tun_meta; + goto release_dst_cache; } params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; @@ -376,7 +388,12 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return ret; +release_dst_cache: +#ifdef CONFIG_DST_CACHE + if (metadata) + dst_cache_destroy(&metadata->u.tun_info.dst_cache); release_tun_meta: +#endif if (metadata) dst_release(&metadata->dst); @@ -564,7 +581,7 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", - .type = TCA_ACT_TUNNEL_KEY, + .id = TCA_ID_TUNNEL_KEY, .owner = THIS_MODULE, .act = tunnel_key_act, .dump = tunnel_key_dump, diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 93fdaf707313..ac0061599225 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -297,7 +297,7 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_vlan_ops = { .kind = "vlan", - .type = TCA_ACT_VLAN, + .id = TCA_ID_VLAN, .owner = THIS_MODULE, .act = tcf_vlan_act, .dump = tcf_vlan_dump, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e2b5cb2eb34e..478095d50f95 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,6 +31,13 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tc_act/tc_pedit.h> +#include <net/tc_act/tc_mirred.h> +#include <net/tc_act/tc_vlan.h> +#include <net/tc_act/tc_tunnel_key.h> +#include <net/tc_act/tc_csum.h> +#include <net/tc_act/tc_gact.h> +#include <net/tc_act/tc_skbedit.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -61,7 +68,8 @@ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) } static const struct tcf_proto_ops * -tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack) +tcf_proto_lookup_ops(const char *kind, bool rtnl_held, + struct netlink_ext_ack *extack) { const struct tcf_proto_ops *ops; @@ -69,9 +77,11 @@ tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack) if (ops) return ops; #ifdef CONFIG_MODULES - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("cls_%s", kind); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); ops = __tcf_proto_lookup_ops(kind); /* We dropped the RTNL semaphore in order to perform * the module load. So, even if we succeeded in loading @@ -152,8 +162,26 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) return TC_H_MAJ(first); } +static bool tcf_proto_is_unlocked(const char *kind) +{ + const struct tcf_proto_ops *ops; + bool ret; + + ops = tcf_proto_lookup_ops(kind, false, NULL); + /* On error return false to take rtnl lock. Proto lookup/create + * functions will perform lookup again and properly handle errors. + */ + if (IS_ERR(ops)) + return false; + + ret = !!(ops->flags & TCF_PROTO_OPS_DOIT_UNLOCKED); + module_put(ops->owner); + return ret; +} + static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, u32 prio, struct tcf_chain *chain, + bool rtnl_held, struct netlink_ext_ack *extack) { struct tcf_proto *tp; @@ -163,7 +191,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, if (!tp) return ERR_PTR(-ENOBUFS); - tp->ops = tcf_proto_lookup_ops(kind, extack); + tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack); if (IS_ERR(tp->ops)) { err = PTR_ERR(tp->ops); goto errout; @@ -172,6 +200,8 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->protocol = protocol; tp->prio = prio; tp->chain = chain; + spin_lock_init(&tp->lock); + refcount_set(&tp->refcnt, 1); err = tp->ops->init(tp); if (err) { @@ -185,14 +215,80 @@ errout: return ERR_PTR(err); } -static void tcf_proto_destroy(struct tcf_proto *tp, +static void tcf_proto_get(struct tcf_proto *tp) +{ + refcount_inc(&tp->refcnt); +} + +static void tcf_chain_put(struct tcf_chain *chain); + +static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { - tp->ops->destroy(tp, extack); + tp->ops->destroy(tp, rtnl_held, extack); + tcf_chain_put(tp->chain); module_put(tp->ops->owner); kfree_rcu(tp, rcu); } +static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + if (refcount_dec_and_test(&tp->refcnt)) + tcf_proto_destroy(tp, rtnl_held, extack); +} + +static int walker_check_empty(struct tcf_proto *tp, void *fh, + struct tcf_walker *arg) +{ + if (fh) { + arg->nonempty = true; + return -1; + } + return 0; +} + +static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held) +{ + struct tcf_walker walker = { .fn = walker_check_empty, }; + + if (tp->ops->walk) { + tp->ops->walk(tp, &walker, rtnl_held); + return !walker.nonempty; + } + return true; +} + +static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held) +{ + spin_lock(&tp->lock); + if (tcf_proto_is_empty(tp, rtnl_held)) + tp->deleting = true; + spin_unlock(&tp->lock); + return tp->deleting; +} + +static void tcf_proto_mark_delete(struct tcf_proto *tp) +{ + spin_lock(&tp->lock); + tp->deleting = true; + spin_unlock(&tp->lock); +} + +static bool tcf_proto_is_deleting(struct tcf_proto *tp) +{ + bool deleting; + + spin_lock(&tp->lock); + deleting = tp->deleting; + spin_unlock(&tp->lock); + + return deleting; +} + +#define ASSERT_BLOCK_LOCKED(block) \ + lockdep_assert_held(&(block)->lock) + struct tcf_filter_chain_list_item { struct list_head list; tcf_chain_head_change_t *chain_head_change; @@ -204,10 +300,13 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, { struct tcf_chain *chain; + ASSERT_BLOCK_LOCKED(block); + chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; list_add_tail(&chain->list, &block->chain_list); + mutex_init(&chain->filter_chain_lock); chain->block = block; chain->index = chain_index; chain->refcnt = 1; @@ -231,29 +330,59 @@ static void tcf_chain0_head_change(struct tcf_chain *chain, if (chain->index) return; + + mutex_lock(&block->lock); list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); + mutex_unlock(&block->lock); } -static void tcf_chain_destroy(struct tcf_chain *chain) +/* Returns true if block can be safely freed. */ + +static bool tcf_chain_detach(struct tcf_chain *chain) { struct tcf_block *block = chain->block; + ASSERT_BLOCK_LOCKED(block); + list_del(&chain->list); if (!chain->index) block->chain0.chain = NULL; + + if (list_empty(&block->chain_list) && + refcount_read(&block->refcnt) == 0) + return true; + + return false; +} + +static void tcf_block_destroy(struct tcf_block *block) +{ + mutex_destroy(&block->lock); + kfree_rcu(block, rcu); +} + +static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block) +{ + struct tcf_block *block = chain->block; + + mutex_destroy(&chain->filter_chain_lock); kfree(chain); - if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt)) - kfree_rcu(block, rcu); + if (free_block) + tcf_block_destroy(block); } static void tcf_chain_hold(struct tcf_chain *chain) { + ASSERT_BLOCK_LOCKED(chain->block); + ++chain->refcnt; } static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { + ASSERT_BLOCK_LOCKED(chain->block); + /* In case all the references are action references, this * chain should not be shown to the user. */ @@ -265,6 +394,8 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, { struct tcf_chain *chain; + ASSERT_BLOCK_LOCKED(block); + list_for_each_entry(chain, &block->chain_list, list) { if (chain->index == chain_index) return chain; @@ -279,31 +410,40 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, bool by_act) { - struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); + struct tcf_chain *chain = NULL; + bool is_first_reference; + mutex_lock(&block->lock); + chain = tcf_chain_lookup(block, chain_index); if (chain) { tcf_chain_hold(chain); } else { if (!create) - return NULL; + goto errout; chain = tcf_chain_create(block, chain_index); if (!chain) - return NULL; + goto errout; } if (by_act) ++chain->action_refcnt; + is_first_reference = chain->refcnt - chain->action_refcnt == 1; + mutex_unlock(&block->lock); /* Send notification only in case we got the first * non-action reference. Until then, the chain acts only as * a placeholder for actions pointing to it and user ought * not know about them. */ - if (chain->refcnt - chain->action_refcnt == 1 && !by_act) + if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, RTM_NEWCHAIN, false); return chain; + +errout: + mutex_unlock(&block->lock); + return chain; } static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, @@ -318,51 +458,94 @@ struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) } EXPORT_SYMBOL(tcf_chain_get_by_act); -static void tc_chain_tmplt_del(struct tcf_chain *chain); +static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, + void *tmplt_priv); +static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, + void *tmplt_priv, u32 chain_index, + struct tcf_block *block, struct sk_buff *oskb, + u32 seq, u16 flags, bool unicast); -static void __tcf_chain_put(struct tcf_chain *chain, bool by_act) +static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, + bool explicitly_created) { + struct tcf_block *block = chain->block; + const struct tcf_proto_ops *tmplt_ops; + bool is_last, free_block = false; + unsigned int refcnt; + void *tmplt_priv; + u32 chain_index; + + mutex_lock(&block->lock); + if (explicitly_created) { + if (!chain->explicitly_created) { + mutex_unlock(&block->lock); + return; + } + chain->explicitly_created = false; + } + if (by_act) chain->action_refcnt--; - chain->refcnt--; + + /* tc_chain_notify_delete can't be called while holding block lock. + * However, when block is unlocked chain can be changed concurrently, so + * save these to temporary variables. + */ + refcnt = --chain->refcnt; + is_last = refcnt - chain->action_refcnt == 0; + tmplt_ops = chain->tmplt_ops; + tmplt_priv = chain->tmplt_priv; + chain_index = chain->index; + + if (refcnt == 0) + free_block = tcf_chain_detach(chain); + mutex_unlock(&block->lock); /* The last dropped non-action reference will trigger notification. */ - if (chain->refcnt - chain->action_refcnt == 0 && !by_act) - tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + if (is_last && !by_act) { + tc_chain_notify_delete(tmplt_ops, tmplt_priv, chain_index, + block, NULL, 0, 0, false); + /* Last reference to chain, no need to lock. */ + chain->flushing = false; + } - if (chain->refcnt == 0) { - tc_chain_tmplt_del(chain); - tcf_chain_destroy(chain); + if (refcnt == 0) { + tc_chain_tmplt_del(tmplt_ops, tmplt_priv); + tcf_chain_destroy(chain, free_block); } } static void tcf_chain_put(struct tcf_chain *chain) { - __tcf_chain_put(chain, false); + __tcf_chain_put(chain, false, false); } void tcf_chain_put_by_act(struct tcf_chain *chain) { - __tcf_chain_put(chain, true); + __tcf_chain_put(chain, true, false); } EXPORT_SYMBOL(tcf_chain_put_by_act); static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { - if (chain->explicitly_created) - tcf_chain_put(chain); + __tcf_chain_put(chain, false, true); } -static void tcf_chain_flush(struct tcf_chain *chain) +static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) { - struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); + struct tcf_proto *tp, *tp_next; + mutex_lock(&chain->filter_chain_lock); + tp = tcf_chain_dereference(chain->filter_chain, chain); + RCU_INIT_POINTER(chain->filter_chain, NULL); tcf_chain0_head_change(chain, NULL); + chain->flushing = true; + mutex_unlock(&chain->filter_chain_lock); + while (tp) { - RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_proto_destroy(tp, NULL); - tp = rtnl_dereference(chain->filter_chain); - tcf_chain_put(chain); + tp_next = rcu_dereference_protected(tp->next, 1); + tcf_proto_put(tp, rtnl_held, NULL); + tp = tp_next; } } @@ -684,8 +867,8 @@ tcf_chain0_head_change_cb_add(struct tcf_block *block, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { - struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; + struct tcf_chain *chain0; item = kmalloc(sizeof(*item), GFP_KERNEL); if (!item) { @@ -694,9 +877,32 @@ tcf_chain0_head_change_cb_add(struct tcf_block *block, } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; - if (chain0 && chain0->filter_chain) - tcf_chain_head_change_item(item, chain0->filter_chain); - list_add(&item->list, &block->chain0.filter_chain_list); + + mutex_lock(&block->lock); + chain0 = block->chain0.chain; + if (chain0) + tcf_chain_hold(chain0); + else + list_add(&item->list, &block->chain0.filter_chain_list); + mutex_unlock(&block->lock); + + if (chain0) { + struct tcf_proto *tp_head; + + mutex_lock(&chain0->filter_chain_lock); + + tp_head = tcf_chain_dereference(chain0->filter_chain, chain0); + if (tp_head) + tcf_chain_head_change_item(item, tp_head); + + mutex_lock(&block->lock); + list_add(&item->list, &block->chain0.filter_chain_list); + mutex_unlock(&block->lock); + + mutex_unlock(&chain0->filter_chain_lock); + tcf_chain_put(chain0); + } + return 0; } @@ -704,20 +910,23 @@ static void tcf_chain0_head_change_cb_del(struct tcf_block *block, struct tcf_block_ext_info *ei) { - struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; + mutex_lock(&block->lock); list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { - if (chain0) + if (block->chain0.chain) tcf_chain_head_change_item(item, NULL); list_del(&item->list); + mutex_unlock(&block->lock); + kfree(item); return; } } + mutex_unlock(&block->lock); WARN_ON(1); } @@ -764,6 +973,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, NL_SET_ERR_MSG(extack, "Memory allocation for block failed"); return ERR_PTR(-ENOMEM); } + mutex_init(&block->lock); INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->cb_list); INIT_LIST_HEAD(&block->owner_list); @@ -799,157 +1009,241 @@ static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index) return block; } -static void tcf_block_flush_all_chains(struct tcf_block *block) +static struct tcf_chain * +__tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain) { - struct tcf_chain *chain; + mutex_lock(&block->lock); + if (chain) + chain = list_is_last(&chain->list, &block->chain_list) ? + NULL : list_next_entry(chain, list); + else + chain = list_first_entry_or_null(&block->chain_list, + struct tcf_chain, list); - /* Hold a refcnt for all chains, so that they don't disappear - * while we are iterating. - */ - list_for_each_entry(chain, &block->chain_list, list) + /* skip all action-only chains */ + while (chain && tcf_chain_held_by_acts_only(chain)) + chain = list_is_last(&chain->list, &block->chain_list) ? + NULL : list_next_entry(chain, list); + + if (chain) tcf_chain_hold(chain); + mutex_unlock(&block->lock); - list_for_each_entry(chain, &block->chain_list, list) - tcf_chain_flush(chain); + return chain; } -static void tcf_block_put_all_chains(struct tcf_block *block) +/* Function to be used by all clients that want to iterate over all chains on + * block. It properly obtains block->lock and takes reference to chain before + * returning it. Users of this function must be tolerant to concurrent chain + * insertion/deletion or ensure that no concurrent chain modification is + * possible. Note that all netlink dump callbacks cannot guarantee to provide + * consistent dump because rtnl lock is released each time skb is filled with + * data and sent to user-space. + */ + +struct tcf_chain * +tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain) { - struct tcf_chain *chain, *tmp; + struct tcf_chain *chain_next = __tcf_get_next_chain(block, chain); - /* At this point, all the chains should have refcnt >= 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { - tcf_chain_put_explicitly_created(chain); + if (chain) tcf_chain_put(chain); - } + + return chain_next; } +EXPORT_SYMBOL(tcf_get_next_chain); -static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) +static struct tcf_proto * +__tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) { - if (refcount_dec_and_test(&block->refcnt)) { - /* Flushing/putting all chains will cause the block to be - * deallocated when last chain is freed. However, if chain_list - * is empty, block has to be manually deallocated. After block - * reference counter reached 0, it is no longer possible to - * increment it or add new chains to block. - */ - bool free_block = list_empty(&block->chain_list); + u32 prio = 0; - if (tcf_block_shared(block)) - tcf_block_remove(block, block->net); - if (!free_block) - tcf_block_flush_all_chains(block); + ASSERT_RTNL(); + mutex_lock(&chain->filter_chain_lock); - if (q) - tcf_block_offload_unbind(block, q, ei); + if (!tp) { + tp = tcf_chain_dereference(chain->filter_chain, chain); + } else if (tcf_proto_is_deleting(tp)) { + /* 'deleting' flag is set and chain->filter_chain_lock was + * unlocked, which means next pointer could be invalid. Restart + * search. + */ + prio = tp->prio + 1; + tp = tcf_chain_dereference(chain->filter_chain, chain); - if (free_block) - kfree_rcu(block, rcu); - else - tcf_block_put_all_chains(block); - } else if (q) { - tcf_block_offload_unbind(block, q, ei); + for (; tp; tp = tcf_chain_dereference(tp->next, chain)) + if (!tp->deleting && tp->prio >= prio) + break; + } else { + tp = tcf_chain_dereference(tp->next, chain); } + + if (tp) + tcf_proto_get(tp); + + mutex_unlock(&chain->filter_chain_lock); + + return tp; } -static void tcf_block_refcnt_put(struct tcf_block *block) +/* Function to be used by all clients that want to iterate over all tp's on + * chain. Users of this function must be tolerant to concurrent tp + * insertion/deletion or ensure that no concurrent chain modification is + * possible. Note that all netlink dump callbacks cannot guarantee to provide + * consistent dump because rtnl lock is released each time skb is filled with + * data and sent to user-space. + */ + +struct tcf_proto * +tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp, + bool rtnl_held) { - __tcf_block_put(block, NULL, NULL); + struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp); + + if (tp) + tcf_proto_put(tp, rtnl_held, NULL); + + return tp_next; } +EXPORT_SYMBOL(tcf_get_next_proto); -/* Find tcf block. - * Set q, parent, cl when appropriate. +static void tcf_block_flush_all_chains(struct tcf_block *block, bool rtnl_held) +{ + struct tcf_chain *chain; + + /* Last reference to block. At this point chains cannot be added or + * removed concurrently. + */ + for (chain = tcf_get_next_chain(block, NULL); + chain; + chain = tcf_get_next_chain(block, chain)) { + tcf_chain_put_explicitly_created(chain); + tcf_chain_flush(chain, rtnl_held); + } +} + +/* Lookup Qdisc and increments its reference counter. + * Set parent, if necessary. */ -static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, - u32 *parent, unsigned long *cl, - int ifindex, u32 block_index, - struct netlink_ext_ack *extack) +static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, + u32 *parent, int ifindex, bool rtnl_held, + struct netlink_ext_ack *extack) { - struct tcf_block *block; + const struct Qdisc_class_ops *cops; + struct net_device *dev; int err = 0; - if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_refcnt_get(net, block_index); - if (!block) { - NL_SET_ERR_MSG(extack, "Block of given index was not found"); - return ERR_PTR(-EINVAL); - } - } else { - const struct Qdisc_class_ops *cops; - struct net_device *dev; - - rcu_read_lock(); + if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) + return 0; - /* Find link */ - dev = dev_get_by_index_rcu(net, ifindex); - if (!dev) { - rcu_read_unlock(); - return ERR_PTR(-ENODEV); - } + rcu_read_lock(); - /* Find qdisc */ - if (!*parent) { - *q = dev->qdisc; - *parent = (*q)->handle; - } else { - *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); - if (!*q) { - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); - err = -EINVAL; - goto errout_rcu; - } - } + /* Find link */ + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } - *q = qdisc_refcount_inc_nz(*q); + /* Find qdisc */ + if (!*parent) { + *q = dev->qdisc; + *parent = (*q)->handle; + } else { + *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); err = -EINVAL; goto errout_rcu; } + } - /* Is it classful? */ - cops = (*q)->ops->cl_ops; - if (!cops) { - NL_SET_ERR_MSG(extack, "Qdisc not classful"); - err = -EINVAL; - goto errout_rcu; - } + *q = qdisc_refcount_inc_nz(*q); + if (!*q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + err = -EINVAL; + goto errout_rcu; + } - if (!cops->tcf_block) { - NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); - err = -EOPNOTSUPP; - goto errout_rcu; - } + /* Is it classful? */ + cops = (*q)->ops->cl_ops; + if (!cops) { + NL_SET_ERR_MSG(extack, "Qdisc not classful"); + err = -EINVAL; + goto errout_qdisc; + } - /* At this point we know that qdisc is not noop_qdisc, - * which means that qdisc holds a reference to net_device - * and we hold a reference to qdisc, so it is safe to release - * rcu read lock. - */ - rcu_read_unlock(); + if (!cops->tcf_block) { + NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); + err = -EOPNOTSUPP; + goto errout_qdisc; + } - /* Do we search for filter, attached to class? */ - if (TC_H_MIN(*parent)) { - *cl = cops->find(*q, *parent); - if (*cl == 0) { - NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); - err = -ENOENT; - goto errout_qdisc; - } +errout_rcu: + /* At this point we know that qdisc is not noop_qdisc, + * which means that qdisc holds a reference to net_device + * and we hold a reference to qdisc, so it is safe to release + * rcu read lock. + */ + rcu_read_unlock(); + return err; + +errout_qdisc: + rcu_read_unlock(); + + if (rtnl_held) + qdisc_put(*q); + else + qdisc_put_unlocked(*q); + *q = NULL; + + return err; +} + +static int __tcf_qdisc_cl_find(struct Qdisc *q, u32 parent, unsigned long *cl, + int ifindex, struct netlink_ext_ack *extack) +{ + if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) + return 0; + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(parent)) { + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + + *cl = cops->find(q, parent); + if (*cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); + return -ENOENT; } + } + + return 0; +} + +static struct tcf_block *__tcf_block_find(struct net *net, struct Qdisc *q, + unsigned long cl, int ifindex, + u32 block_index, + struct netlink_ext_ack *extack) +{ + struct tcf_block *block; - /* And the last stroke */ - block = cops->tcf_block(*q, *cl, extack); + if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_refcnt_get(net, block_index); if (!block) { - err = -EINVAL; - goto errout_qdisc; + NL_SET_ERR_MSG(extack, "Block of given index was not found"); + return ERR_PTR(-EINVAL); } + } else { + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + + block = cops->tcf_block(q, cl, extack); + if (!block) + return ERR_PTR(-EINVAL); + if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); - err = -EOPNOTSUPP; - goto errout_qdisc; + return ERR_PTR(-EOPNOTSUPP); } /* Always take reference to block in order to support execution @@ -962,24 +1256,91 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, } return block; +} + +static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei, bool rtnl_held) +{ + if (refcount_dec_and_mutex_lock(&block->refcnt, &block->lock)) { + /* Flushing/putting all chains will cause the block to be + * deallocated when last chain is freed. However, if chain_list + * is empty, block has to be manually deallocated. After block + * reference counter reached 0, it is no longer possible to + * increment it or add new chains to block. + */ + bool free_block = list_empty(&block->chain_list); + + mutex_unlock(&block->lock); + if (tcf_block_shared(block)) + tcf_block_remove(block, block->net); + + if (q) + tcf_block_offload_unbind(block, q, ei); + + if (free_block) + tcf_block_destroy(block); + else + tcf_block_flush_all_chains(block, rtnl_held); + } else if (q) { + tcf_block_offload_unbind(block, q, ei); + } +} + +static void tcf_block_refcnt_put(struct tcf_block *block, bool rtnl_held) +{ + __tcf_block_put(block, NULL, NULL, rtnl_held); +} + +/* Find tcf block. + * Set q, parent, cl when appropriate. + */ + +static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, + u32 *parent, unsigned long *cl, + int ifindex, u32 block_index, + struct netlink_ext_ack *extack) +{ + struct tcf_block *block; + int err = 0; + + ASSERT_RTNL(); + + err = __tcf_qdisc_find(net, q, parent, ifindex, true, extack); + if (err) + goto errout; + + err = __tcf_qdisc_cl_find(*q, *parent, cl, ifindex, extack); + if (err) + goto errout_qdisc; + + block = __tcf_block_find(net, *q, *cl, ifindex, block_index, extack); + if (IS_ERR(block)) { + err = PTR_ERR(block); + goto errout_qdisc; + } + + return block; -errout_rcu: - rcu_read_unlock(); errout_qdisc: - if (*q) { + if (*q) qdisc_put(*q); - *q = NULL; - } +errout: + *q = NULL; return ERR_PTR(err); } -static void tcf_block_release(struct Qdisc *q, struct tcf_block *block) +static void tcf_block_release(struct Qdisc *q, struct tcf_block *block, + bool rtnl_held) { if (!IS_ERR_OR_NULL(block)) - tcf_block_refcnt_put(block); + tcf_block_refcnt_put(block, rtnl_held); - if (q) - qdisc_put(q); + if (q) { + if (rtnl_held) + qdisc_put(q); + else + qdisc_put_unlocked(q); + } } struct tcf_block_owner_item { @@ -1087,7 +1448,7 @@ err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: err_block_insert: - tcf_block_refcnt_put(block); + tcf_block_refcnt_put(block, true); return err; } EXPORT_SYMBOL(tcf_block_get_ext); @@ -1124,7 +1485,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - __tcf_block_put(block, q, ei); + __tcf_block_put(block, q, ei, true); } EXPORT_SYMBOL(tcf_block_put_ext); @@ -1181,13 +1542,19 @@ tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_priv, bool add, bool offload_in_use, struct netlink_ext_ack *extack) { - struct tcf_chain *chain; - struct tcf_proto *tp; + struct tcf_chain *chain, *chain_prev; + struct tcf_proto *tp, *tp_prev; int err; - list_for_each_entry(chain, &block->chain_list, list) { - for (tp = rtnl_dereference(chain->filter_chain); tp; - tp = rtnl_dereference(tp->next)) { + for (chain = __tcf_get_next_chain(block, NULL); + chain; + chain_prev = chain, + chain = __tcf_get_next_chain(block, chain), + tcf_chain_put(chain_prev)) { + for (tp = __tcf_get_next_proto(chain, NULL); tp; + tp_prev = tp, + tp = __tcf_get_next_proto(chain, tp), + tcf_proto_put(tp_prev, true, NULL)) { if (tp->ops->reoffload) { err = tp->ops->reoffload(tp, add, cb, cb_priv, extack); @@ -1204,6 +1571,8 @@ tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, return 0; err_playback_remove: + tcf_proto_put(tp, true, NULL); + tcf_chain_put(chain); tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, extack); return err; @@ -1329,32 +1698,116 @@ struct tcf_chain_info { struct tcf_proto __rcu *next; }; -static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain_info *chain_info) +static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain *chain, + struct tcf_chain_info *chain_info) { - return rtnl_dereference(*chain_info->pprev); + return tcf_chain_dereference(*chain_info->pprev, chain); } -static void tcf_chain_tp_insert(struct tcf_chain *chain, - struct tcf_chain_info *chain_info, - struct tcf_proto *tp) +static int tcf_chain_tp_insert(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + struct tcf_proto *tp) { + if (chain->flushing) + return -EAGAIN; + if (*chain_info->pprev == chain->filter_chain) tcf_chain0_head_change(chain, tp); - RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); + tcf_proto_get(tp); + RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); - tcf_chain_hold(chain); + + return 0; } static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { - struct tcf_proto *next = rtnl_dereference(chain_info->next); + struct tcf_proto *next = tcf_chain_dereference(chain_info->next, chain); + tcf_proto_mark_delete(tp); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); - tcf_chain_put(chain); +} + +static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + u32 protocol, u32 prio, + bool prio_allocate); + +/* Try to insert new proto. + * If proto with specified priority already exists, free new proto + * and return existing one. + */ + +static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, + struct tcf_proto *tp_new, + u32 protocol, u32 prio, + bool rtnl_held) +{ + struct tcf_chain_info chain_info; + struct tcf_proto *tp; + int err = 0; + + mutex_lock(&chain->filter_chain_lock); + + tp = tcf_chain_tp_find(chain, &chain_info, + protocol, prio, false); + if (!tp) + err = tcf_chain_tp_insert(chain, &chain_info, tp_new); + mutex_unlock(&chain->filter_chain_lock); + + if (tp) { + tcf_proto_destroy(tp_new, rtnl_held, NULL); + tp_new = tp; + } else if (err) { + tcf_proto_destroy(tp_new, rtnl_held, NULL); + tp_new = ERR_PTR(err); + } + + return tp_new; +} + +static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, + struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + struct tcf_chain_info chain_info; + struct tcf_proto *tp_iter; + struct tcf_proto **pprev; + struct tcf_proto *next; + + mutex_lock(&chain->filter_chain_lock); + + /* Atomically find and remove tp from chain. */ + for (pprev = &chain->filter_chain; + (tp_iter = tcf_chain_dereference(*pprev, chain)); + pprev = &tp_iter->next) { + if (tp_iter == tp) { + chain_info.pprev = pprev; + chain_info.next = tp_iter->next; + WARN_ON(tp_iter->deleting); + break; + } + } + /* Verify that tp still exists and no new filters were inserted + * concurrently. + * Mark tp for deletion if it is empty. + */ + if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) { + mutex_unlock(&chain->filter_chain_lock); + return; + } + + next = tcf_chain_dereference(chain_info.next, chain); + if (tp == chain->filter_chain) + tcf_chain0_head_change(chain, next); + RCU_INIT_POINTER(*chain_info.pprev, next); + mutex_unlock(&chain->filter_chain_lock); + + tcf_proto_put(tp, rtnl_held, extack); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, @@ -1367,7 +1820,8 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, /* Check the chain for existence of proto-tcf with this priority */ for (pprev = &chain->filter_chain; - (tp = rtnl_dereference(*pprev)); pprev = &tp->next) { + (tp = tcf_chain_dereference(*pprev, chain)); + pprev = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { if (prio_allocate || @@ -1380,14 +1834,20 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, } } chain_info->pprev = pprev; - chain_info->next = tp ? tp->next : NULL; + if (tp) { + chain_info->next = tp->next; + tcf_proto_get(tp); + } else { + chain_info->next = NULL; + } return tp; } static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + bool rtnl_held) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1415,7 +1875,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, if (!fh) { tcm->tcm_handle = 0; } else { - if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0) + if (tp->ops->dump && + tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } nlh->nlmsg_len = skb_tail_pointer(skb) - b; @@ -1430,7 +1891,8 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, - u32 parent, void *fh, int event, bool unicast) + u32 parent, void *fh, int event, bool unicast, + bool rtnl_held) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1440,7 +1902,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, return -ENOBUFS; if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, event) <= 0) { + n->nlmsg_seq, n->nlmsg_flags, event, + rtnl_held) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1456,7 +1919,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, bool unicast, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1467,13 +1930,14 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, return -ENOBUFS; if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { + n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, + rtnl_held) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; } - err = tp->ops->delete(tp, fh, last, extack); + err = tp->ops->delete(tp, fh, last, rtnl_held, extack); if (err) { kfree_skb(skb); return err; @@ -1492,14 +1956,21 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event) + struct tcf_chain *chain, int event, + bool rtnl_held) { struct tcf_proto *tp; - for (tp = rtnl_dereference(chain->filter_chain); - tp; tp = rtnl_dereference(tp->next)) + for (tp = tcf_get_next_proto(chain, NULL, rtnl_held); + tp; tp = tcf_get_next_proto(chain, tp, rtnl_held)) tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false); + q, parent, NULL, event, false, rtnl_held); +} + +static void tfilter_put(struct tcf_proto *tp, void *fh) +{ + if (tp->ops->put && fh) + tp->ops->put(tp, fh); } static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, @@ -1522,6 +1993,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *fh; int err; int tp_created; + bool rtnl_held = false; if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1538,7 +2010,9 @@ replay: prio = TC_H_MAJ(t->tcm_info); prio_allocate = false; parent = t->tcm_parent; + tp = NULL; cl = 0; + block = NULL; if (prio == 0) { /* If no priority is provided by the user, @@ -1555,8 +2029,27 @@ replay: /* Find head of filter chain. */ - block = tcf_block_find(net, &q, &parent, &cl, - t->tcm_ifindex, t->tcm_block_index, extack); + err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); + if (err) + return err; + + /* Take rtnl mutex if rtnl_held was set to true on previous iteration, + * block is shared (no qdisc found), qdisc is not unlocked, classifier + * type is not specified, classifier is not unlocked. + */ + if (rtnl_held || + (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || + !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + rtnl_held = true; + rtnl_lock(); + } + + err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); + if (err) + goto errout; + + block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, + extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; @@ -1575,40 +2068,62 @@ replay: goto errout; } + mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate); if (IS_ERR(tp)) { NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = PTR_ERR(tp); - goto errout; + goto errout_locked; } if (tp == NULL) { + struct tcf_proto *tp_new = NULL; + + if (chain->flushing) { + err = -EAGAIN; + goto errout_locked; + } + /* Proto-tcf does not exist, create new one */ if (tca[TCA_KIND] == NULL || !protocol) { NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified"); err = -EINVAL; - goto errout; + goto errout_locked; } if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; - goto errout; + goto errout_locked; } if (prio_allocate) - prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); + prio = tcf_auto_prio(tcf_chain_tp_prev(chain, + &chain_info)); + + mutex_unlock(&chain->filter_chain_lock); + tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]), + protocol, prio, chain, rtnl_held, + extack); + if (IS_ERR(tp_new)) { + err = PTR_ERR(tp_new); + goto errout_tp; + } - tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, chain, extack); + tp_created = 1; + tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio, + rtnl_held); if (IS_ERR(tp)) { err = PTR_ERR(tp); - goto errout; + goto errout_tp; } - tp_created = 1; - } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { + } else { + mutex_unlock(&chain->filter_chain_lock); + } + + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; @@ -1623,6 +2138,7 @@ replay: goto errout; } } else if (n->nlmsg_flags & NLM_F_EXCL) { + tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Filter already exists"); err = -EEXIST; goto errout; @@ -1636,25 +2152,41 @@ replay: err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, - extack); + rtnl_held, extack); if (err == 0) { - if (tp_created) - tcf_chain_tp_insert(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_NEWTFILTER, false); - } else { - if (tp_created) - tcf_proto_destroy(tp, NULL); + RTM_NEWTFILTER, false, rtnl_held); + tfilter_put(tp, fh); } errout: - if (chain) - tcf_chain_put(chain); - tcf_block_release(q, block); - if (err == -EAGAIN) + if (err && tp_created) + tcf_chain_tp_delete_empty(chain, tp, rtnl_held, NULL); +errout_tp: + if (chain) { + if (tp && !IS_ERR(tp)) + tcf_proto_put(tp, rtnl_held, NULL); + if (!tp_created) + tcf_chain_put(chain); + } + tcf_block_release(q, block, rtnl_held); + + if (rtnl_held) + rtnl_unlock(); + + if (err == -EAGAIN) { + /* Take rtnl lock in case EAGAIN is caused by concurrent flush + * of target chain. + */ + rtnl_held = true; /* Replay the request. */ goto replay; + } return err; + +errout_locked: + mutex_unlock(&chain->filter_chain_lock); + goto errout; } static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, @@ -1670,11 +2202,12 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; - struct tcf_block *block; + struct tcf_block *block = NULL; struct tcf_proto *tp = NULL; unsigned long cl = 0; void *fh = NULL; int err; + bool rtnl_held = false; if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1695,8 +2228,27 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, /* Find head of filter chain. */ - block = tcf_block_find(net, &q, &parent, &cl, - t->tcm_ifindex, t->tcm_block_index, extack); + err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); + if (err) + return err; + + /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc + * found), qdisc is not unlocked, classifier type is not specified, + * classifier is not unlocked. + */ + if (!prio || + (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || + !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + rtnl_held = true; + rtnl_lock(); + } + + err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); + if (err) + goto errout; + + block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, + extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; @@ -1724,56 +2276,69 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); - tcf_chain_flush(chain); + chain, RTM_DELTFILTER, rtnl_held); + tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; } + mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false); if (!tp || IS_ERR(tp)) { NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = tp ? PTR_ERR(tp) : -ENOENT; - goto errout; + goto errout_locked; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; + goto errout_locked; + } else if (t->tcm_handle == 0) { + tcf_chain_tp_remove(chain, &chain_info, tp); + mutex_unlock(&chain->filter_chain_lock); + + tcf_proto_put(tp, rtnl_held, NULL); + tfilter_notify(net, skb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, false, rtnl_held); + err = 0; goto errout; } + mutex_unlock(&chain->filter_chain_lock); fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { - if (t->tcm_handle == 0) { - tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_DELTFILTER, false); - tcf_proto_destroy(tp, extack); - err = 0; - } else { - NL_SET_ERR_MSG(extack, "Specified filter handle not found"); - err = -ENOENT; - } + NL_SET_ERR_MSG(extack, "Specified filter handle not found"); + err = -ENOENT; } else { bool last; err = tfilter_del_notify(net, skb, n, tp, block, q, parent, fh, false, &last, - extack); + rtnl_held, extack); + if (err) goto errout; - if (last) { - tcf_chain_tp_remove(chain, &chain_info, tp); - tcf_proto_destroy(tp, extack); - } + if (last) + tcf_chain_tp_delete_empty(chain, tp, rtnl_held, extack); } errout: - if (chain) + if (chain) { + if (tp && !IS_ERR(tp)) + tcf_proto_put(tp, rtnl_held, NULL); tcf_chain_put(chain); - tcf_block_release(q, block); + } + tcf_block_release(q, block, rtnl_held); + + if (rtnl_held) + rtnl_unlock(); + return err; + +errout_locked: + mutex_unlock(&chain->filter_chain_lock); + goto errout; } static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, @@ -1789,11 +2354,12 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; - struct tcf_block *block; + struct tcf_block *block = NULL; struct tcf_proto *tp = NULL; unsigned long cl = 0; void *fh = NULL; int err; + bool rtnl_held = false; err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) @@ -1811,8 +2377,26 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, /* Find head of filter chain. */ - block = tcf_block_find(net, &q, &parent, &cl, - t->tcm_ifindex, t->tcm_block_index, extack); + err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); + if (err) + return err; + + /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not + * unlocked, classifier type is not specified, classifier is not + * unlocked. + */ + if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || + !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + rtnl_held = true; + rtnl_lock(); + } + + err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); + if (err) + goto errout; + + block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, + extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; @@ -1831,8 +2415,10 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, goto errout; } + mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false); + mutex_unlock(&chain->filter_chain_lock); if (!tp || IS_ERR(tp)) { NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = tp ? PTR_ERR(tp) : -ENOENT; @@ -1850,15 +2436,23 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, - fh, RTM_NEWTFILTER, true); + fh, RTM_NEWTFILTER, true, rtnl_held); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } + tfilter_put(tp, fh); errout: - if (chain) + if (chain) { + if (tp && !IS_ERR(tp)) + tcf_proto_put(tp, rtnl_held, NULL); tcf_chain_put(chain); - tcf_block_release(q, block); + } + tcf_block_release(q, block, rtnl_held); + + if (rtnl_held) + rtnl_unlock(); + return err; } @@ -1879,7 +2473,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER); + RTM_NEWTFILTER, true); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, @@ -1889,11 +2483,15 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct tcf_proto *tp, *tp_prev; struct tcf_dump_args arg; - struct tcf_proto *tp; - for (tp = rtnl_dereference(chain->filter_chain); - tp; tp = rtnl_dereference(tp->next), (*p_index)++) { + for (tp = __tcf_get_next_proto(chain, NULL); + tp; + tp_prev = tp, + tp = __tcf_get_next_proto(chain, tp), + tcf_proto_put(tp_prev, true, NULL), + (*p_index)++) { if (*p_index < index_start) continue; if (TC_H_MAJ(tcm->tcm_info) && @@ -1909,9 +2507,8 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER) <= 0) - return false; - + RTM_NEWTFILTER, true) <= 0) + goto errout; cb->args[1] = 1; } if (!tp->ops->walk) @@ -1926,23 +2523,27 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; - tp->ops->walk(tp, &arg.w); + tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) - return false; + goto errout; } return true; + +errout: + tcf_proto_put(tp, true, NULL); + return false; } /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { + struct tcf_chain *chain, *chain_prev; struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; - struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); long index_start; long index; @@ -2006,19 +2607,24 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) index_start = cb->args[0]; index = 0; - list_for_each_entry(chain, &block->chain_list, list) { + for (chain = __tcf_get_next_chain(block, NULL); + chain; + chain_prev = chain, + chain = __tcf_get_next_chain(block, chain), + tcf_chain_put(chain_prev)) { if (tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, index_start, &index)) { + tcf_chain_put(chain); err = -EMSGSIZE; break; } } if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) - tcf_block_refcnt_put(block); + tcf_block_refcnt_put(block, true); cb->args[0] = index; out: @@ -2028,8 +2634,10 @@ out: return skb->len; } -static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, - struct sk_buff *skb, struct tcf_block *block, +static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, + void *tmplt_priv, u32 chain_index, + struct net *net, struct sk_buff *skb, + struct tcf_block *block, u32 portid, u32 seq, u16 flags, int event) { unsigned char *b = skb_tail_pointer(skb); @@ -2038,8 +2646,8 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, struct tcmsg *tcm; void *priv; - ops = chain->tmplt_ops; - priv = chain->tmplt_priv; + ops = tmplt_ops; + priv = tmplt_priv; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -2057,7 +2665,7 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, tcm->tcm_block_index = block->index; } - if (nla_put_u32(skb, TCA_CHAIN, chain->index)) + if (nla_put_u32(skb, TCA_CHAIN, chain_index)) goto nla_put_failure; if (ops) { @@ -2088,7 +2696,8 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tc_chain_fill_node(chain, net, skb, block, portid, + if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, + chain->index, net, skb, block, portid, seq, flags, event) <= 0) { kfree_skb(skb); return -EINVAL; @@ -2100,6 +2709,31 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } +static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, + void *tmplt_priv, u32 chain_index, + struct tcf_block *block, struct sk_buff *oskb, + u32 seq, u16 flags, bool unicast) +{ + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + struct net *net = block->net; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, + block, portid, seq, flags, RTM_DELCHAIN) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + if (unicast) + return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); +} + static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, struct nlattr **tca, struct netlink_ext_ack *extack) @@ -2111,7 +2745,7 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, if (!tca[TCA_KIND]) return 0; - ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack); + ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), true, extack); if (IS_ERR(ops)) return PTR_ERR(ops); if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { @@ -2129,16 +2763,15 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, return 0; } -static void tc_chain_tmplt_del(struct tcf_chain *chain) +static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, + void *tmplt_priv) { - const struct tcf_proto_ops *ops = chain->tmplt_ops; - /* If template ops are set, no work to do for us. */ - if (!ops) + if (!tmplt_ops) return; - ops->tmplt_destroy(chain->tmplt_priv); - module_put(ops->owner); + tmplt_ops->tmplt_destroy(tmplt_priv); + module_put(tmplt_ops->owner); } /* Add/delete/get a chain */ @@ -2181,6 +2814,8 @@ replay: err = -EINVAL; goto errout_block; } + + mutex_lock(&block->lock); chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { @@ -2192,54 +2827,61 @@ replay: } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); err = -EEXIST; - goto errout_block; + goto errout_block_locked; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); err = -ENOENT; - goto errout_block; + goto errout_block_locked; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); err = -ENOMEM; - goto errout_block; + goto errout_block_locked; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; - goto errout_block; + goto errout_block_locked; } tcf_chain_hold(chain); } + if (n->nlmsg_type == RTM_NEWCHAIN) { + /* Modifying chain requires holding parent block lock. In case + * the chain was successfully added, take a reference to the + * chain. This ensures that an empty chain does not disappear at + * the end of this function. + */ + tcf_chain_hold(chain); + chain->explicitly_created = true; + } + mutex_unlock(&block->lock); + switch (n->nlmsg_type) { case RTM_NEWCHAIN: err = tc_chain_tmplt_add(chain, net, tca, extack); - if (err) + if (err) { + tcf_chain_put_explicitly_created(chain); goto errout; - /* In case the chain was successfully added, take a reference - * to the chain. This ensures that an empty chain - * does not disappear at the end of this function. - */ - tcf_chain_hold(chain); - chain->explicitly_created = true; + } + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, RTM_NEWCHAIN, false); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, true); /* Flush the chain first as the user requested chain removal. */ - tcf_chain_flush(chain); + tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); - chain->explicitly_created = false; break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, @@ -2256,11 +2898,15 @@ replay: errout: tcf_chain_put(chain); errout_block: - tcf_block_release(q, block); + tcf_block_release(q, block, true); if (err == -EAGAIN) /* Replay the request. */ goto replay; return err; + +errout_block_locked: + mutex_unlock(&block->lock); + goto errout_block; } /* called with RTNL */ @@ -2270,8 +2916,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; - struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct tcf_chain *chain; long index_start; long index; u32 parent; @@ -2334,6 +2980,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index_start = cb->args[0]; index = 0; + mutex_lock(&block->lock); list_for_each_entry(chain, &block->chain_list, list) { if ((tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index)) @@ -2344,7 +2991,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) } if (tcf_chain_held_by_acts_only(chain)) continue; - err = tc_chain_fill_node(chain, net, skb, block, + err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, + chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWCHAIN); @@ -2352,9 +3000,10 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) break; index++; } + mutex_unlock(&block->lock); if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) - tcf_block_refcnt_put(block); + tcf_block_refcnt_put(block, true); cb->args[0] = index; out: @@ -2376,7 +3025,7 @@ EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { @@ -2386,7 +3035,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, true, extack); + TCA_ACT_BIND, rtnl_held, + extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -2398,13 +3048,12 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - exts->actions, &attr_size, true, - extack); + exts->actions, &attr_size, + rtnl_held, extack); if (err < 0) return err; exts->nr_actions = err; } - exts->net = net; } #else if ((exts->action && tb[exts->action]) || @@ -2515,6 +3164,114 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, } EXPORT_SYMBOL(tc_setup_cb_call); +int tc_setup_flow_action(struct flow_action *flow_action, + const struct tcf_exts *exts) +{ + const struct tc_action *act; + int i, j, k; + + if (!exts) + return 0; + + j = 0; + tcf_exts_for_each_action(i, act, exts) { + struct flow_action_entry *entry; + + entry = &flow_action->entries[j]; + if (is_tcf_gact_ok(act)) { + entry->id = FLOW_ACTION_ACCEPT; + } else if (is_tcf_gact_shot(act)) { + entry->id = FLOW_ACTION_DROP; + } else if (is_tcf_gact_trap(act)) { + entry->id = FLOW_ACTION_TRAP; + } else if (is_tcf_gact_goto_chain(act)) { + entry->id = FLOW_ACTION_GOTO; + entry->chain_index = tcf_gact_goto_chain_index(act); + } else if (is_tcf_mirred_egress_redirect(act)) { + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = tcf_mirred_dev(act); + } else if (is_tcf_mirred_egress_mirror(act)) { + entry->id = FLOW_ACTION_MIRRED; + entry->dev = tcf_mirred_dev(act); + } else if (is_tcf_vlan(act)) { + switch (tcf_vlan_action(act)) { + case TCA_VLAN_ACT_PUSH: + entry->id = FLOW_ACTION_VLAN_PUSH; + entry->vlan.vid = tcf_vlan_push_vid(act); + entry->vlan.proto = tcf_vlan_push_proto(act); + entry->vlan.prio = tcf_vlan_push_prio(act); + break; + case TCA_VLAN_ACT_POP: + entry->id = FLOW_ACTION_VLAN_POP; + break; + case TCA_VLAN_ACT_MODIFY: + entry->id = FLOW_ACTION_VLAN_MANGLE; + entry->vlan.vid = tcf_vlan_push_vid(act); + entry->vlan.proto = tcf_vlan_push_proto(act); + entry->vlan.prio = tcf_vlan_push_prio(act); + break; + default: + goto err_out; + } + } else if (is_tcf_tunnel_set(act)) { + entry->id = FLOW_ACTION_TUNNEL_ENCAP; + entry->tunnel = tcf_tunnel_info(act); + } else if (is_tcf_tunnel_release(act)) { + entry->id = FLOW_ACTION_TUNNEL_DECAP; + entry->tunnel = tcf_tunnel_info(act); + } else if (is_tcf_pedit(act)) { + for (k = 0; k < tcf_pedit_nkeys(act); k++) { + switch (tcf_pedit_cmd(act, k)) { + case TCA_PEDIT_KEY_EX_CMD_SET: + entry->id = FLOW_ACTION_MANGLE; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + entry->id = FLOW_ACTION_ADD; + break; + default: + goto err_out; + } + entry->mangle.htype = tcf_pedit_htype(act, k); + entry->mangle.mask = tcf_pedit_mask(act, k); + entry->mangle.val = tcf_pedit_val(act, k); + entry->mangle.offset = tcf_pedit_offset(act, k); + entry = &flow_action->entries[++j]; + } + } else if (is_tcf_csum(act)) { + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = tcf_csum_update_flags(act); + } else if (is_tcf_skbedit_mark(act)) { + entry->id = FLOW_ACTION_MARK; + entry->mark = tcf_skbedit_mark(act); + } else { + goto err_out; + } + + if (!is_tcf_pedit(act)) + j++; + } + return 0; +err_out: + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(tc_setup_flow_action); + +unsigned int tcf_exts_num_actions(struct tcf_exts *exts) +{ + unsigned int num_acts = 0; + struct tc_action *act; + int i; + + tcf_exts_for_each_action(i, act, exts) { + if (is_tcf_pedit(act)) + num_acts += tcf_pedit_nkeys(act); + else + num_acts++; + } + return num_acts; +} +EXPORT_SYMBOL(tcf_exts_num_actions); + static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); @@ -2555,10 +3312,12 @@ static int __init tc_filter_init(void) if (err) goto err_rhash_setup_block_ht; - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, + RTNL_FLAG_DOIT_UNLOCKED); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, + RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, - tc_dump_tfilter, 0); + tc_dump_tfilter, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 6a5dce8baf19..687b0af67878 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -18,6 +18,7 @@ #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/idr.h> +#include <linux/percpu.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> @@ -35,6 +36,7 @@ struct basic_filter { struct tcf_result res; struct tcf_proto *tp; struct list_head link; + struct tc_basic_pcnt __percpu *pf; struct rcu_work rwork; }; @@ -46,8 +48,10 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct basic_filter *f; list_for_each_entry_rcu(f, &head->flist, link) { + __this_cpu_inc(f->pf->rcnt); if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; + __this_cpu_inc(f->pf->rhit); *res = f->res; r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) @@ -89,6 +93,7 @@ static void __basic_delete_filter(struct basic_filter *f) tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); + free_percpu(f->pf); kfree(f); } @@ -102,7 +107,8 @@ static void basic_delete_filter_work(struct work_struct *work) rtnl_unlock(); } -static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void basic_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; @@ -121,7 +127,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) } static int basic_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f = arg; @@ -148,7 +154,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); if (err < 0) return err; @@ -168,7 +174,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, bool ovr, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -193,7 +199,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_init(&fnew->exts, net, TCA_BASIC_ACT, TCA_BASIC_POLICE); if (err < 0) goto errout; @@ -208,6 +214,11 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; fnew->handle = handle; + fnew->pf = alloc_percpu(struct tc_basic_pcnt); + if (!fnew->pf) { + err = -ENOMEM; + goto errout; + } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, extack); @@ -231,12 +242,14 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + free_percpu(fnew->pf); tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } -static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; @@ -263,10 +276,12 @@ static void basic_bind_class(void *fh, u32 classid, unsigned long cl) } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { + struct tc_basic_pcnt gpf = {}; struct basic_filter *f = fh; struct nlattr *nest; + int cpu; if (f == NULL) return skb->len; @@ -281,6 +296,18 @@ static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) goto nla_put_failure; + for_each_possible_cpu(cpu) { + struct tc_basic_pcnt *pf = per_cpu_ptr(f->pf, cpu); + + gpf.rcnt += pf->rcnt; + gpf.rhit += pf->rhit; + } + + if (nla_put_64bit(skb, TCA_BASIC_PCNT, + sizeof(struct tc_basic_pcnt), + &gpf, TCA_BASIC_PAD)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index a95cb240a606..b4ac58039cb1 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -298,7 +298,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, } static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -307,7 +307,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last, return 0; } -static void cls_bpf_destroy(struct tcf_proto *tp, +static void cls_bpf_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -417,7 +417,8 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack); + ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true, + extack); if (ret < 0) return ret; @@ -455,7 +456,8 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, struct netlink_ext_ack *extack) + void **arg, bool ovr, bool rtnl_held, + struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *oldprog = *arg; @@ -475,7 +477,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!prog) return -ENOBUFS; - ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_init(&prog->exts, net, TCA_BPF_ACT, TCA_BPF_POLICE); if (ret < 0) goto errout; @@ -575,7 +577,7 @@ static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, } static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *tm) + struct sk_buff *skb, struct tcmsg *tm, bool rtnl_held) { struct cls_bpf_prog *prog = fh; struct nlattr *nest; @@ -635,7 +637,8 @@ static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl) prog->res.class = cl; } -static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3bc01bdde165..4c1567854f95 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -78,7 +78,7 @@ static void cls_cgroup_destroy_work(struct work_struct *work) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, + void **arg, bool ovr, bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; @@ -99,7 +99,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOBUFS; - err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_init(&new->exts, net, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); if (err < 0) goto errout; new->handle = handle; @@ -110,7 +110,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, goto errout; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr, - extack); + true, extack); if (err < 0) goto errout; @@ -130,7 +130,7 @@ errout: return err; } -static void cls_cgroup_destroy(struct tcf_proto *tp, +static void cls_cgroup_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); @@ -145,18 +145,21 @@ static void cls_cgroup_destroy(struct tcf_proto *tp, } static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } -static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (arg->count < arg->skip) goto skip; + if (!head) + return; if (arg->fn(tp, head, arg) < 0) { arg->stop = 1; return; @@ -166,7 +169,7 @@ skip: } static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); struct nlattr *nest; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 2bb043cd436b..eece1ee26930 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -391,7 +391,8 @@ static void flow_destroy_filter_work(struct work_struct *work) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, struct netlink_ext_ack *extack) + void **arg, bool ovr, bool rtnl_held, + struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *fold, *fnew; @@ -440,12 +441,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err1; - err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&fnew->exts, net, TCA_FLOW_ACT, TCA_FLOW_POLICE); if (err < 0) goto err2; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr, - extack); + true, extack); if (err < 0) goto err2; @@ -566,7 +567,7 @@ err1: } static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f = arg; @@ -590,7 +591,8 @@ static int flow_init(struct tcf_proto *tp) return 0; } -static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void flow_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; @@ -617,7 +619,7 @@ static void *flow_get(struct tcf_proto *tp, u32 handle) } static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct flow_filter *f = fh; struct nlattr *nest; @@ -677,7 +679,8 @@ nla_put_failure: return -1; } -static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 12ca9d13db83..27300a3e76c7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -381,16 +381,31 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, bool skip_sw = tc_skip_sw(f->flags); int err; + cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts)); + if (!cls_flower.rule) + return -ENOMEM; + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; - cls_flower.dissector = &f->mask->dissector; - cls_flower.mask = &f->mask->key; - cls_flower.key = &f->mkey; - cls_flower.exts = &f->exts; + cls_flower.rule->match.dissector = &f->mask->dissector; + cls_flower.rule->match.mask = &f->mask->key; + cls_flower.rule->match.key = &f->mkey; cls_flower.classid = f->res.classid; + err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); + if (err) { + kfree(cls_flower.rule); + if (skip_sw) { + NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); + return err; + } + return 0; + } + err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); + kfree(cls_flower.rule); + if (err < 0) { fl_hw_destroy_filter(tp, f, NULL); return err; @@ -413,10 +428,13 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; - cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); + + tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, + cls_flower.stats.pkts, + cls_flower.stats.lastused); } static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, @@ -451,7 +469,8 @@ static void fl_destroy_sleepable(struct work_struct *work) module_put(THIS_MODULE); } -static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void fl_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct fl_flow_mask *mask, *next_mask; @@ -1258,7 +1277,8 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, + extack); if (err < 0) return err; @@ -1285,7 +1305,8 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, struct netlink_ext_ack *extack) + void **arg, bool ovr, bool rtnl_held, + struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *fold = *arg; @@ -1323,7 +1344,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout_tb; } - err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0); if (err < 0) goto errout; @@ -1422,7 +1443,7 @@ errout_mask_alloc: } static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; @@ -1434,7 +1455,8 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, return 0; } -static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; @@ -1467,18 +1489,36 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, if (tc_skip_hw(f->flags)) continue; + cls_flower.rule = + flow_rule_alloc(tcf_exts_num_actions(&f->exts)); + if (!cls_flower.rule) + return -ENOMEM; + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = add ? TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long)f; - cls_flower.dissector = &mask->dissector; - cls_flower.mask = &mask->key; - cls_flower.key = &f->mkey; - cls_flower.exts = &f->exts; + cls_flower.rule->match.dissector = &mask->dissector; + cls_flower.rule->match.mask = &mask->key; + cls_flower.rule->match.key = &f->mkey; + + err = tc_setup_flow_action(&cls_flower.rule->action, + &f->exts); + if (err) { + kfree(cls_flower.rule); + if (tc_skip_sw(f->flags)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); + return err; + } + continue; + } + cls_flower.classid = f->res.classid; err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + kfree(cls_flower.rule); + if (err) { if (add && tc_skip_sw(f->flags)) return err; @@ -1493,25 +1533,30 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } -static void fl_hw_create_tmplt(struct tcf_chain *chain, - struct fl_flow_tmplt *tmplt) +static int fl_hw_create_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) { struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = chain->block; - struct tcf_exts dummy_exts = { 0, }; + + cls_flower.rule = flow_rule_alloc(0); + if (!cls_flower.rule) + return -ENOMEM; cls_flower.common.chain_index = chain->index; cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE; cls_flower.cookie = (unsigned long) tmplt; - cls_flower.dissector = &tmplt->dissector; - cls_flower.mask = &tmplt->mask; - cls_flower.key = &tmplt->dummy_key; - cls_flower.exts = &dummy_exts; + cls_flower.rule->match.dissector = &tmplt->dissector; + cls_flower.rule->match.mask = &tmplt->mask; + cls_flower.rule->match.key = &tmplt->dummy_key; /* We don't care if driver (any of them) fails to handle this * call. It serves just as a hint for it. */ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); + kfree(cls_flower.rule); + + return 0; } static void fl_hw_destroy_tmplt(struct tcf_chain *chain, @@ -1555,12 +1600,14 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); if (err) goto errout_tmplt; - kfree(tb); fl_init_dissector(&tmplt->dissector, &tmplt->mask); - fl_hw_create_tmplt(chain, tmplt); + err = fl_hw_create_tmplt(chain, tmplt); + if (err) + goto errout_tmplt; + kfree(tb); return tmplt; errout_tmplt: @@ -2008,7 +2055,7 @@ nla_put_failure: } static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct cls_fl_filter *f = fh; struct nlattr *nest; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 29eeeaf3ea44..ad036b00427d 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -139,7 +139,8 @@ static void fw_delete_filter_work(struct work_struct *work) rtnl_unlock(); } -static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void fw_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; @@ -163,7 +164,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) } static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = arg; @@ -217,7 +218,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, int err; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, - extack); + true, extack); if (err < 0) return err; @@ -250,7 +251,8 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, - bool ovr, struct netlink_ext_ack *extack) + bool ovr, bool rtnl_held, + struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; @@ -283,7 +285,8 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, #endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; - err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT, + TCA_FW_POLICE); if (err < 0) { kfree(fnew); return err; @@ -332,7 +335,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) return -ENOBUFS; - err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&f->exts, net, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) goto errout; f->id = handle; @@ -354,7 +357,8 @@ errout: return err; } -static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct fw_head *head = rtnl_dereference(tp->root); int h; @@ -384,7 +388,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) } static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = fh; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 0e408ee9dcec..459921bd3d87 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/percpu.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> @@ -22,6 +23,7 @@ struct cls_mall_head { u32 handle; u32 flags; unsigned int in_hw_count; + struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; }; @@ -34,6 +36,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; *res = head->res; + __this_cpu_inc(head->pf->rhit); return tcf_exts_exec(skb, &head->exts, res); } @@ -46,6 +49,7 @@ static void __mall_destroy(struct cls_mall_head *head) { tcf_exts_destroy(&head->exts); tcf_exts_put_net(&head->exts); + free_percpu(head->pf); kfree(head); } @@ -105,7 +109,8 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, return 0; } -static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void mall_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); @@ -141,7 +146,8 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, { int err; - err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack); + err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, true, + extack); if (err < 0) return err; @@ -155,7 +161,8 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, struct netlink_ext_ack *extack) + void **arg, bool ovr, bool rtnl_held, + struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; @@ -184,7 +191,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOBUFS; - err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0); + err = tcf_exts_init(&new->exts, net, TCA_MATCHALL_ACT, 0); if (err) goto err_exts_init; @@ -192,6 +199,11 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, handle = 1; new->handle = handle; new->flags = flags; + new->pf = alloc_percpu(struct tc_matchall_pcnt); + if (!new->pf) { + err = -ENOMEM; + goto err_alloc_percpu; + } err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, extack); @@ -214,6 +226,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, err_replace_hw_filter: err_set_parms: + free_percpu(new->pf); +err_alloc_percpu: tcf_exts_destroy(&new->exts); err_exts_init: kfree(new); @@ -221,17 +235,21 @@ err_exts_init: } static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } -static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct cls_mall_head *head = rtnl_dereference(tp->root); if (arg->count < arg->skip) goto skip; + + if (!head) + return; if (arg->fn(tp, head, arg) < 0) arg->stop = 1; skip: @@ -268,10 +286,12 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, } static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { + struct tc_matchall_pcnt gpf = {}; struct cls_mall_head *head = fh; struct nlattr *nest; + int cpu; if (!head) return skb->len; @@ -289,6 +309,17 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags)) goto nla_put_failure; + for_each_possible_cpu(cpu) { + struct tc_matchall_pcnt *pf = per_cpu_ptr(head->pf, cpu); + + gpf.rhit += pf->rhit; + } + + if (nla_put_64bit(skb, TCA_MATCHALL_PCNT, + sizeof(struct tc_matchall_pcnt), + &gpf, TCA_MATCHALL_PAD)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &head->exts)) goto nla_put_failure; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 0404aa5fa7cb..f006af23b64a 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -276,7 +276,8 @@ static void route4_queue_work(struct route4_filter *f) tcf_queue_work(&f->rwork, route4_delete_filter_work); } -static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void route4_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; @@ -312,7 +313,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) } static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter *f = arg; @@ -393,7 +394,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); if (err < 0) return err; @@ -468,7 +469,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, bool ovr, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -496,7 +497,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (!f) goto errout; - err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); if (err < 0) goto errout; @@ -560,15 +561,13 @@ errout: return err; } -static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct route4_head *head = rtnl_dereference(tp->root); unsigned int h, h1; - if (head == NULL) - arg->stop = 1; - - if (arg->stop) + if (head == NULL || arg->stop) return; for (h = 0; h <= 256; h++) { @@ -597,7 +596,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) } static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct route4_filter *f = fh; struct nlattr *nest; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index e9ccf7daea7d..0719a21d9c41 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -312,7 +312,8 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) __rsvp_delete_filter(f); } -static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); int h1, h2; @@ -341,7 +342,7 @@ static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) } static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_filter *nfp, *f = arg; @@ -477,7 +478,8 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, struct netlink_ext_ack *extack) + void **arg, bool ovr, bool rtnl_held, + struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); struct rsvp_filter *f, *nfp; @@ -499,10 +501,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true, + extack); if (err < 0) goto errout2; @@ -520,7 +523,8 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, goto errout2; } - err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT, + TCA_RSVP_POLICE); if (err < 0) { kfree(n); goto errout2; @@ -548,7 +552,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout2; - err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) goto errout; h2 = 16; @@ -654,7 +658,8 @@ errout2: return err; } -static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct rsvp_head *head = rtnl_dereference(tp->root); unsigned int h, h1; @@ -688,7 +693,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) } static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct rsvp_filter *f = fh; struct rsvp_session *s; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 38bb882bb958..24e0a62a65cc 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -173,7 +173,7 @@ static void tcindex_destroy_fexts_work(struct work_struct *work) } static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = arg; @@ -246,10 +246,12 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; -static int tcindex_filter_result_init(struct tcindex_filter_result *r) +static int tcindex_filter_result_init(struct tcindex_filter_result *r, + struct net *net) { memset(r, 0, sizeof(*r)); - return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, + TCA_TCINDEX_POLICE); } static void tcindex_partial_destroy_work(struct work_struct *work) @@ -281,13 +283,10 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) return -ENOMEM; for (i = 0; i < cp->hash; i++) { - err = tcf_exts_init(&cp->perfect[i].exts, + err = tcf_exts_init(&cp->perfect[i].exts, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) goto errout; -#ifdef CONFIG_NET_CLS_ACT - cp->perfect[i].exts.net = net; -#endif } return 0; @@ -310,10 +309,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, int err, balloc = 0; struct tcf_exts e; - err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack); if (err < 0) goto errout; @@ -344,7 +343,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } cp->h = p->h; - err = tcindex_filter_result_init(&new_filter_result); + err = tcindex_filter_result_init(&new_filter_result, net); if (err < 0) goto errout1; if (old_r) @@ -431,7 +430,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, goto errout_alloc; f->key = handle; f->next = NULL; - err = tcindex_filter_result_init(&f->result); + err = tcindex_filter_result_init(&f->result, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -444,7 +443,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r); + err = tcindex_filter_result_init(old_r, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -496,7 +495,7 @@ static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, bool ovr, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; @@ -519,7 +518,8 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, tca[TCA_RATE], ovr, extack); } -static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) +static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, + bool rtnl_held) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter *f, *next; @@ -555,7 +555,7 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } -static void tcindex_destroy(struct tcf_proto *tp, +static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcindex_data *p = rtnl_dereference(tp->root); @@ -582,7 +582,7 @@ static void tcindex_destroy(struct tcf_proto *tp, for (f = rtnl_dereference(p->h[i]); f; f = next) { next = rtnl_dereference(f->next); - tcindex_delete(tp, &f->result, &last, NULL); + tcindex_delete(tp, &f->result, &last, rtnl_held, NULL); } } @@ -591,7 +591,7 @@ static void tcindex_destroy(struct tcf_proto *tp, static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = fh; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index dcea21004604..48e76a3acf8a 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -629,7 +629,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, return -ENOENT; } -static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) +static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); @@ -663,7 +664,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) } static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; struct tc_u_common *tp_c = tp->data; @@ -726,7 +727,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, { int err; - err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack); + err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack); if (err < 0) return err; @@ -803,7 +804,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, rcu_assign_pointer(*ins, n); } -static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, +static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); @@ -848,7 +849,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, #endif memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); - if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { + if (tcf_exts_init(&new->exts, net, TCA_U32_ACT, TCA_U32_POLICE)) { kfree(new); return NULL; } @@ -858,7 +859,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, + struct nlattr **tca, void **arg, bool ovr, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; @@ -910,7 +911,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - new = u32_init_knode(tp, n); + new = u32_init_knode(net, tp, n); if (!new) return -ENOMEM; @@ -1060,7 +1061,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; n->flags = flags; - err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); + err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) goto errout; @@ -1123,7 +1124,8 @@ erridr: return err; } -static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, + bool rtnl_held) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -1281,7 +1283,7 @@ static void u32_bind_class(void *fh, u32 classid, unsigned long cl) } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct tc_u_knode *n = fh; struct tc_u_hnode *ht_up, *ht_down; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 7e4d1ccf4c87..352b46f98440 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -526,11 +526,6 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, return stab; } -static void stab_kfree_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct qdisc_size_table, rcu)); -} - void qdisc_put_stab(struct qdisc_size_table *tab) { if (!tab) @@ -538,7 +533,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab) if (--tab->refcnt == 0) { list_del(&tab->list); - call_rcu(&tab->rcu, stab_kfree_rcu); + kfree_rcu(tab, rcu); } } EXPORT_SYMBOL(qdisc_put_stab); @@ -758,8 +753,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev) return 0; } -void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, - unsigned int len) +void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; @@ -1202,9 +1196,11 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } else { if (handle == 0) { handle = qdisc_alloc_handle(dev); - err = -ENOMEM; - if (handle == 0) + if (handle == 0) { + NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded"); + err = -ENOSPC; goto err_out3; + } } if (!netif_is_multiqueue(dev)) sch->flags |= TCQ_F_ONETXQUEUE; @@ -1910,17 +1906,19 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, block = cops->tcf_block(q, cl, NULL); if (!block) return; - list_for_each_entry(chain, &block->chain_list, list) { + for (chain = tcf_get_next_chain(block, NULL); + chain; + chain = tcf_get_next_chain(block, chain)) { struct tcf_proto *tp; - for (tp = rtnl_dereference(chain->filter_chain); - tp; tp = rtnl_dereference(tp->next)) { + for (tp = tcf_get_next_proto(chain, NULL, true); + tp; tp = tcf_get_next_proto(chain, tp, true)) { struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; arg.classid = clid; arg.cl = new_cl; - tp->ops->walk(tp, &arg.w); + tp->ops->walk(tp, &arg.w, true); } } } diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 73940293700d..1d2a12132abc 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -138,8 +138,8 @@ struct cake_flow { struct cake_host { u32 srchost_tag; u32 dsthost_tag; - u16 srchost_refcnt; - u16 dsthost_refcnt; + u16 srchost_bulk_flow_count; + u16 dsthost_bulk_flow_count; }; struct cake_heap_entry { @@ -258,7 +258,8 @@ enum { CAKE_FLAG_AUTORATE_INGRESS = BIT(1), CAKE_FLAG_INGRESS = BIT(2), CAKE_FLAG_WASH = BIT(3), - CAKE_FLAG_SPLIT_GSO = BIT(4) + CAKE_FLAG_SPLIT_GSO = BIT(4), + CAKE_FLAG_FWMARK = BIT(5) }; /* COBALT operates the Codel and BLUE algorithms in parallel, in order to @@ -746,8 +747,10 @@ skip_hash: * queue, accept the collision, update the host tags. */ q->way_collisions++; - q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--; - q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--; + if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { + q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; + q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + } allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); found: @@ -767,13 +770,14 @@ found: } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { - if (!q->hosts[outer_hash + k].srchost_refcnt) + if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) break; } q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - q->hosts[srchost_idx].srchost_refcnt++; + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; } @@ -789,13 +793,14 @@ found_src: } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { - if (!q->hosts[outer_hash + k].dsthost_refcnt) + if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) break; } q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - q->hosts[dsthost_idx].dsthost_refcnt++; + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; } } @@ -1508,20 +1513,6 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) return idx + (tin << 16); } -static void cake_wash_diffserv(struct sk_buff *skb) -{ - switch (skb->protocol) { - case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); - break; - case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); - break; - default: - break; - } -} - static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) { u8 dscp; @@ -1553,25 +1544,32 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, { struct cake_sched_data *q = qdisc_priv(sch); u32 tin; + u8 dscp; + + /* Tin selection: Default to diffserv-based selection, allow overriding + * using firewall marks or skb->priority. + */ + dscp = cake_handle_diffserv(skb, + q->rate_flags & CAKE_FLAG_WASH); + + if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) + tin = 0; - if (TC_H_MAJ(skb->priority) == sch->handle && - TC_H_MIN(skb->priority) > 0 && - TC_H_MIN(skb->priority) <= q->tin_cnt) { + else if (q->rate_flags & CAKE_FLAG_FWMARK && /* use fw mark */ + skb->mark && + skb->mark <= q->tin_cnt) + tin = q->tin_order[skb->mark - 1]; + + else if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->tin_cnt) tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; - if (q->rate_flags & CAKE_FLAG_WASH) - cake_wash_diffserv(skb); - } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { - /* extract the Diffserv Precedence field, if it exists */ - /* and clear DSCP bits if washing */ - tin = q->tin_index[cake_handle_diffserv(skb, - q->rate_flags & CAKE_FLAG_WASH)]; + else { + tin = q->tin_index[dscp]; + if (unlikely(tin >= q->tin_cnt)) tin = 0; - } else { - tin = 0; - if (q->rate_flags & CAKE_FLAG_WASH) - cake_wash_diffserv(skb); } return &q->tins[tin]; @@ -1794,20 +1792,30 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count++; if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_refcnt); + host_load = max(host_load, srchost->srchost_bulk_flow_count); if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_refcnt); + host_load = max(host_load, dsthost->dsthost_bulk_flow_count); flow->deficit = (b->flow_quantum * quantum_div[host_load]) >> 16; } else if (flow->set == CAKE_SET_SPARSE_WAIT) { + struct cake_host *srchost = &b->hosts[flow->srchost]; + struct cake_host *dsthost = &b->hosts[flow->dsthost]; + /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. */ flow->set = CAKE_SET_BULK; b->sparse_flow_count--; b->bulk_flow_count++; + + if (cake_dsrc(q->flow_mode)) + srchost->srchost_bulk_flow_count++; + + if (cake_ddst(q->flow_mode)) + dsthost->dsthost_bulk_flow_count++; + } if (q->buffer_used > q->buffer_max_used) @@ -1975,23 +1983,8 @@ retry: dsthost = &b->hosts[flow->dsthost]; host_load = 1; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_refcnt); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_refcnt); - - WARN_ON(host_load > CAKE_QUEUES); - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { - /* The shifted prandom_u32() is a way to apply dithering to - * avoid accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - (prandom_u32() >> 16)) >> 16; - list_move_tail(&flow->flowchain, &b->old_flows); - /* Keep all flows with deficits out of the sparse and decaying * rotations. No non-empty flow can go into the decaying * rotation, so they can't get deficits @@ -2000,6 +1993,13 @@ retry: if (flow->head) { b->sparse_flow_count--; b->bulk_flow_count++; + + if (cake_dsrc(q->flow_mode)) + srchost->srchost_bulk_flow_count++; + + if (cake_ddst(q->flow_mode)) + dsthost->dsthost_bulk_flow_count++; + flow->set = CAKE_SET_BULK; } else { /* we've moved it to the bulk rotation for @@ -2009,6 +2009,22 @@ retry: flow->set = CAKE_SET_SPARSE_WAIT; } } + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_bulk_flow_count); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_bulk_flow_count); + + WARN_ON(host_load > CAKE_QUEUES); + + /* The shifted prandom_u32() is a way to apply dithering to + * avoid accumulating roundoff errors + */ + flow->deficit += (b->flow_quantum * quantum_div[host_load] + + (prandom_u32() >> 16)) >> 16; + list_move_tail(&flow->flowchain, &b->old_flows); + goto retry; } @@ -2029,6 +2045,13 @@ retry: &b->decaying_flows); if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; + + if (cake_dsrc(q->flow_mode)) + srchost->srchost_bulk_flow_count--; + + if (cake_ddst(q->flow_mode)) + dsthost->dsthost_bulk_flow_count--; + b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) { @@ -2042,14 +2065,19 @@ retry: if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) b->sparse_flow_count--; - else if (flow->set == CAKE_SET_BULK) + else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - else + + if (cake_dsrc(q->flow_mode)) + srchost->srchost_bulk_flow_count--; + + if (cake_ddst(q->flow_mode)) + dsthost->dsthost_bulk_flow_count--; + + } else b->decaying_flow_count--; flow->set = CAKE_SET_NONE; - srchost->srchost_refcnt--; - dsthost->dsthost_refcnt--; } goto begin; } @@ -2590,6 +2618,13 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } + if (tb[TCA_CAKE_FWMARK]) { + if (!!nla_get_u32(tb[TCA_CAKE_FWMARK])) + q->rate_flags |= CAKE_FLAG_FWMARK; + else + q->rate_flags &= ~CAKE_FLAG_FWMARK; + } + if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2749,6 +2784,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_FWMARK, + !!(q->rate_flags & CAKE_FLAG_FWMARK))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 968a85fe4d4a..a117d9260558 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -68,7 +68,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_cpu_qlen_dec(q); + qdisc_qstats_atomic_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -108,7 +108,7 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_cpu_qlen_inc(q); + qdisc_qstats_atomic_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; @@ -147,7 +147,7 @@ static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) qdisc_qstats_cpu_requeues_inc(q); qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_cpu_qlen_inc(q); + qdisc_qstats_atomic_qlen_inc(q); skb = next; } @@ -252,7 +252,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_cpu_qlen_dec(q); + qdisc_qstats_atomic_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -559,7 +559,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = { }; static struct netdev_queue noop_netdev_queue = { - .qdisc = &noop_qdisc, + RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), .qdisc_sleeping = &noop_qdisc, }; @@ -645,7 +645,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (unlikely(err)) return qdisc_drop_cpu(skb, qdisc, to_free); - qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_atomic_qlen_inc(qdisc); /* Note: skb can not be used after skb_array_produce(), * so we better not use qdisc_qstats_cpu_backlog_inc() */ @@ -670,7 +670,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(skb)) { qdisc_qstats_cpu_backlog_dec(qdisc, skb); qdisc_bstats_cpu_update(qdisc, skb); - qdisc_qstats_cpu_qlen_dec(qdisc); + qdisc_qstats_atomic_qlen_dec(qdisc); } return skb; @@ -714,7 +714,6 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; - q->qlen = 0; } } @@ -1366,7 +1365,11 @@ static void mini_qdisc_rcu_func(struct rcu_head *head) void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { - struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq); + /* Protected with chain0->filter_chain_lock. + * Can't access chain directly because tp_head can be NULL. + */ + struct mini_Qdisc *miniq_old = + rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index d1429371592f..1cc0c7b74aa3 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -17,9 +17,7 @@ * University of Oslo, Norway. * * References: - * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 - * IEEE Conference on High Performance Switching and Routing 2013 : - * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + * RFC 8033: https://tools.ietf.org/html/rfc8033 */ #include <linux/module.h> @@ -31,9 +29,9 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#define QUEUE_THRESHOLD 10000 +#define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 -#define MAX_PROB 0xffffffff +#define MAX_PROB 0xffffffffffffffff #define PIE_SCALE 8 /* parameters used */ @@ -49,14 +47,16 @@ struct pie_params { /* variables used */ struct pie_vars { - u32 prob; /* probability but scaled by u32 limit. */ + u64 prob; /* probability but scaled by u64 limit. */ psched_time_t burst_time; psched_time_t qdelay; psched_time_t qdelay_old; u64 dq_count; /* measured in bytes */ psched_time_t dq_tstamp; /* drain rate */ + u64 accu_prob; /* accumulated drop probability */ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ u32 qlen_old; /* in bytes */ + u8 accu_prob_overflows; /* overflows of accu_prob */ }; /* statistics gathering */ @@ -81,9 +81,9 @@ static void pie_params_init(struct pie_params *params) { params->alpha = 2; params->beta = 20; - params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ + params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->ecn = false; params->bytemode = false; } @@ -91,16 +91,18 @@ static void pie_params_init(struct pie_params *params) static void pie_vars_init(struct pie_vars *vars) { vars->dq_count = DQCOUNT_INVALID; + vars->accu_prob = 0; vars->avg_dq_rate = 0; - /* default of 100 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); + /* default of 150 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); + vars->accu_prob_overflows = 0; } static bool drop_early(struct Qdisc *sch, u32 packet_size) { struct pie_sched_data *q = qdisc_priv(sch); - u32 rnd; - u32 local_prob = q->vars.prob; + u64 rnd; + u64 local_prob = q->vars.prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ @@ -124,13 +126,33 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) * probablity. Smaller packets will have lower drop prob in this case */ if (q->params.bytemode && packet_size <= mtu) - local_prob = (local_prob / mtu) * packet_size; + local_prob = (u64)packet_size * div_u64(local_prob, mtu); else local_prob = q->vars.prob; - rnd = prandom_u32(); - if (rnd < local_prob) + if (local_prob == 0) { + q->vars.accu_prob = 0; + q->vars.accu_prob_overflows = 0; + } + + if (local_prob > MAX_PROB - q->vars.accu_prob) + q->vars.accu_prob_overflows++; + + q->vars.accu_prob += local_prob; + + if (q->vars.accu_prob_overflows == 0 && + q->vars.accu_prob < (MAX_PROB / 100) * 85) + return false; + if (q->vars.accu_prob_overflows == 8 && + q->vars.accu_prob >= MAX_PROB / 2) + return true; + + prandom_bytes(&rnd, 8); + if (rnd < local_prob) { + q->vars.accu_prob = 0; + q->vars.accu_prob_overflows = 0; return true; + } return false; } @@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; + q->vars.accu_prob = 0; + q->vars.accu_prob_overflows = 0; return qdisc_drop(skb, sch, to_free); } @@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch) u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ - s32 delta = 0; /* determines the change in probability */ - u32 oldprob; - u32 alpha, beta; + s64 delta = 0; /* determines the change in probability */ + u64 oldprob; + u64 alpha, beta; + u32 power; bool update_prob = true; q->vars.qdelay_old = q->vars.qdelay; @@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch) * value for alpha as 0.125. In this implementation, we use values 0-32 * passed from user space to represent this. Also, alpha and beta have * unit of HZ and need to be scaled before they can used to update - * probability. alpha/beta are updated locally below by 1) scaling them - * appropriately 2) scaling down by 16 to come to 0-2 range. - * Please see paper for details. - * - * We scale alpha and beta differently depending on whether we are in - * light, medium or high dropping mode. + * probability. alpha/beta are updated locally below by scaling down + * by 16 to come to 0-2 range. */ - if (q->vars.prob < MAX_PROB / 100) { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; - } else if (q->vars.prob < MAX_PROB / 10) { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; - } else { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + + /* We scale alpha and beta differently depending on how heavy the + * congestion is. Please see RFC 8033 for details. + */ + if (q->vars.prob < MAX_PROB / 10) { + alpha >>= 1; + beta >>= 1; + + power = 100; + while (q->vars.prob < div_u64(MAX_PROB, power) && + power <= 1000000) { + alpha >>= 2; + beta >>= 2; + power *= 10; + } } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * ((qdelay - q->params.target)); - delta += beta * ((qdelay - qdelay_old)); + delta += alpha * (u64)(qdelay - q->params.target); + delta += beta * (u64)(qdelay - qdelay_old); oldprob = q->vars.prob; /* to ensure we increase probability in steps of no more than 2% */ - if (delta > (s32)(MAX_PROB / (100 / 2)) && + if (delta > (s64)(MAX_PROB / (100 / 2)) && q->vars.prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; @@ -406,7 +429,8 @@ static void calculate_probability(struct Qdisc *sch) */ if (qdelay == 0 && qdelay_old == 0 && update_prob) - q->vars.prob = (q->vars.prob * 98) / 100; + /* Reduce drop probability to 98.4% */ + q->vars.prob -= q->vars.prob / 64u; q->vars.qdelay = qdelay; q->vars.qlen_old = qlen; |