Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig        |  12
-rw-r--r--  net/sched/Makefile       |   1
-rw-r--r--  net/sched/act_api.c      |  43
-rw-r--r--  net/sched/act_ct.c       |   2
-rw-r--r--  net/sched/act_gate.c     | 639
-rw-r--r--  net/sched/cls_api.c      | 260
-rw-r--r--  net/sched/cls_flower.c   | 350
-rw-r--r--  net/sched/em_ipt.c       |   2
-rw-r--r--  net/sched/sch_api.c      |   3
-rw-r--r--  net/sched/sch_cake.c     |  65
-rw-r--r--  net/sched/sch_choke.c    |   6
-rw-r--r--  net/sched/sch_fq.c       | 143
-rw-r--r--  net/sched/sch_generic.c  | 109
-rw-r--r--  net/sched/sch_red.c      |   9
14 files changed, 1361 insertions, 283 deletions
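
Most of this change is the new act_gate action, which admits or drops packets according to a time-gated entry list that repeats every cycle. Its gate_get_start_time() helper (in the act_gate.c hunk below) projects the configured base time forward to the next cycle boundary before arming the hrtimer. A minimal userspace sketch of that arithmetic follows; the function and variable names are illustrative, not kernel API:

/* Sketch of act_gate's start-time computation: if the configured base
 * time is already in the past, the next cycle starts at
 * base + (n + 1) * cycle, where n is the number of whole cycles that
 * have elapsed since base. Illustrative names, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

static int next_cycle_start(uint64_t base, uint64_t now, uint64_t cycle,
			    uint64_t *start)
{
	uint64_t n;

	if (base > now) {		/* base time still in the future */
		*start = base;
		return 0;
	}
	if (!cycle)			/* cycle time must not be zero */
		return -1;
	n = (now - base) / cycle;	/* whole cycles already elapsed */
	*start = base + (n + 1) * cycle;
	return 0;
}

int main(void)
{
	uint64_t start;

	/* base = 1s, now = 3.5s, cycle = 1s -> next start at 4s */
	if (!next_cycle_start(1000000000ULL, 3500000000ULL,
			      1000000000ULL, &start))
		printf("next cycle starts at %llu ns\n",
		       (unsigned long long)start);
	return 0;
}

With base = 1 s, cycle = 1 s and now = 3.5 s, two whole cycles have elapsed, so the next start is base + 3 * cycle = 4 s.
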
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bfbefb7bff9d..2f20073f4f84 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -981,6 +981,18 @@ config NET_ACT_CT To compile this code as a module, choose M here: the module will be called act_ct. +config NET_ACT_GATE + tristate "Frame gate entry list control tc action" + depends on NET_CLS_ACT + help + Say Y here to allow to control the ingress flow to be passed at + specific time slot and be dropped at other specific time slot by + the gate entry list. + + If unsure, say N. + To compile this code as a module, choose M here: the + module will be called act_gate. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index 31c367a6cd09..66bbf9a98f9e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o +obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index df4560909157..8ac7eb0a8309 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -766,12 +766,10 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int +tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a) { - int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) @@ -789,6 +787,23 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + if (tcf_action_dump_terse(skb, a)) + goto nla_put_failure; + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) @@ -820,7 +835,7 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], - int bind, int ref) + int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; @@ -831,7 +846,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, bind, ref); + err = terse ? 
tcf_action_dump_terse(skb, a) : + tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); @@ -876,19 +892,14 @@ static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) return hw_stats_bf.value; } -static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; -static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; - static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, - [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_flags_allowed }, - [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_hw_stats_allowed }, + [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS), + [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -1138,7 +1149,7 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], if (!nest) goto out_nlmsg_trim; - if (tcf_action_dump(skb, actions, bind, ref) < 0) + if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); @@ -1454,10 +1465,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return ret; } -static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 20577355235a..e29f0f45d688 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -30,6 +30,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <uapi/linux/netfilter/nf_nat.h> @@ -539,6 +540,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); + nf_ct_acct_update(ct, dir, skb->len); return true; } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c new file mode 100644 index 000000000000..9c628591f452 --- /dev/null +++ b/net/sched/act_gate.c @@ -0,0 +1,639 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright 2020 NXP */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <net/act_api.h> +#include <net/netlink.h> +#include <net/pkt_cls.h> +#include <net/tc_act/tc_gate.h> + +static unsigned int gate_net_id; +static struct tc_action_ops act_gate_ops; + +static ktime_t gate_get_time(struct tcf_gate *gact) +{ + ktime_t mono = ktime_get(); + + switch (gact->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, gact->tk_offset); + } + + return KTIME_MAX; +} + +static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +{ + struct tcf_gate_params *param = &gact->param; + ktime_t now, base, cycle; + u64 n; + + base = 
ns_to_ktime(param->tcfg_basetime); + now = gate_get_time(gact); + + if (ktime_after(base, now)) { + *start = base; + return 0; + } + + cycle = param->tcfg_cycletime; + + /* cycle time should not be zero */ + if (!cycle) + return -EFAULT; + + n = div64_u64(ktime_sub_ns(now, base), cycle); + *start = ktime_add_ns(base, (n + 1) * cycle); + return 0; +} + +static void gate_start_timer(struct tcf_gate *gact, ktime_t start) +{ + ktime_t expires; + + expires = hrtimer_get_expires(&gact->hitimer); + if (expires == 0) + expires = KTIME_MAX; + + start = min_t(ktime_t, start, expires); + + hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); +} + +static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) +{ + struct tcf_gate *gact = container_of(timer, struct tcf_gate, + hitimer); + struct tcf_gate_params *p = &gact->param; + struct tcfg_gate_entry *next; + ktime_t close_time, now; + + spin_lock(&gact->tcf_lock); + + next = gact->next_entry; + + /* cycle start, clear pending bit, clear total octets */ + gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; + gact->current_entry_octets = 0; + gact->current_max_octets = next->maxoctets; + + gact->current_close_time = ktime_add_ns(gact->current_close_time, + next->interval); + + close_time = gact->current_close_time; + + if (list_is_last(&next->list, &p->entries)) + next = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + else + next = list_next_entry(next, list); + + now = gate_get_time(gact); + + if (ktime_after(now, close_time)) { + ktime_t cycle, base; + u64 n; + + cycle = p->tcfg_cycletime; + base = ns_to_ktime(p->tcfg_basetime); + n = div64_u64(ktime_sub_ns(now, base), cycle); + close_time = ktime_add_ns(base, (n + 1) * cycle); + } + + gact->next_entry = next; + + hrtimer_set_expires(&gact->hitimer, close_time); + + spin_unlock(&gact->tcf_lock); + + return HRTIMER_RESTART; +} + +static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_gate *gact = to_gate(a); + + spin_lock(&gact->tcf_lock); + + tcf_lastuse_update(&gact->tcf_tm); + bstats_update(&gact->tcf_bstats, skb); + + if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { + spin_unlock(&gact->tcf_lock); + return gact->tcf_action; + } + + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + goto drop; + + if (gact->current_max_octets >= 0) { + gact->current_entry_octets += qdisc_pkt_len(skb); + if (gact->current_entry_octets > gact->current_max_octets) { + gact->tcf_qstats.overlimits++; + goto drop; + } + } + + spin_unlock(&gact->tcf_lock); + + return gact->tcf_action; +drop: + gact->tcf_qstats.drops++; + spin_unlock(&gact->tcf_lock); + + return TC_ACT_SHOT; +} + +static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { + [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, + [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, +}; + +static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { + [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate), + .type = NLA_EXACT_LEN }, + [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, + [TCA_GATE_FLAGS] = { .type = NLA_U32 }, + [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, +}; + +static int 
fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); + + if (tb[TCA_GATE_ENTRY_INTERVAL]) + interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); + + if (interval == 0) { + NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + if (tb[TCA_GATE_ENTRY_IPV]) + entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); + else + entry->ipv = -1; + + if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) + entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); + else + entry->maxoctets = -1; + + return 0; +} + +static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_gate_entry(tb, entry, extack); +} + +static void release_entry_list(struct list_head *entries) +{ + struct tcfg_gate_entry *entry, *e; + + list_for_each_entry_safe(entry, e, entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int parse_gate_list(struct nlattr *list_attr, + struct tcf_gate_params *sched, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list_attr) + return -EINVAL; + + nla_for_each_nested(n, list_attr, rem) { + if (nla_type(n) != TCA_GATE_ONE_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + err = -ENOMEM; + goto release_list; + } + + err = parse_gate_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + goto release_list; + } + + list_add_tail(&entry->list, &sched->entries); + i++; + } + + sched->num_entries = i; + + return i; + +release_list: + release_entry_list(&sched->entries); + + return err; +} + +static int tcf_gate_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + enum tk_offsets tk_offset = TK_OFFS_TAI; + struct nlattr *tb[TCA_GATE_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tcf_gate_params *p; + s32 clockid = CLOCK_TAI; + struct tcf_gate *gact; + struct tc_gate *parm; + int ret = 0, err; + u64 basetime = 0; + u32 gflags = 0; + s32 prio = -1; + ktime_t start; + u32 index; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_GATE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_GATE_PARMS]); + index = parm->index; + + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + + if (err && bind) + return 0; + + if (!err) { + ret = tcf_idr_create(tn, index, est, a, + &act_gate_ops, bind, false, 0); + if (ret) { + tcf_idr_cleanup(tn, index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + if (ret == ACT_P_CREATED) { + to_gate(*a)->param.tcfg_clockid = -1; + INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); + } + + if 
(tb[TCA_GATE_PRIORITY]) + prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); + + if (tb[TCA_GATE_BASE_TIME]) + basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); + + if (tb[TCA_GATE_FLAGS]) + gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); + + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + goto release_idr; + } + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + gact = to_gate(*a); + + spin_lock_bh(&gact->tcf_lock); + p = &gact->param; + + if (tb[TCA_GATE_CYCLE_TIME]) { + p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + if (!p->tcfg_cycletime_ext) + goto chain_put; + } + + if (tb[TCA_GATE_ENTRY_LIST]) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto chain_put; + } + + if (!p->tcfg_cycletime) { + struct tcfg_gate_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &p->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + p->tcfg_cycletime = cycle; + } + + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + p->tcfg_cycletime_ext = + nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + + p->tcfg_priority = prio; + p->tcfg_basetime = basetime; + p->tcfg_clockid = clockid; + p->tcfg_flags = gflags; + + gact->tk_offset = tk_offset; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; + + err = gate_get_start_time(gact, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Internal error: failed get start time"); + release_entry_list(&p->entries); + goto chain_put; + } + + gact->current_close_time = start; + gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; + + gact->next_entry = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + + gate_start_timer(gact, start); + + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +chain_put: + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_gate_cleanup(struct tc_action *a) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_gate_params *p; + + p = &gact->param; + if (p->tcfg_clockid != -1) + hrtimer_cancel(&gact->hitimer); + + release_entry_list(&p->entries); +} + +static int dumping_entry(struct sk_buff *skb, + struct tcfg_gate_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) + goto nla_put_failure; + + return nla_nest_end(skb, item); + +nla_put_failure: + nla_nest_cancel(skb, item); + return -1; +} + +static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, + 
int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gate *gact = to_gate(a); + struct tc_gate opt = { + .index = gact->tcf_index, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, + }; + struct tcfg_gate_entry *entry; + struct tcf_gate_params *p; + struct nlattr *entry_list; + struct tcf_t t; + + spin_lock_bh(&gact->tcf_lock); + opt.action = gact->tcf_action; + + p = &gact->param; + + if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, + p->tcfg_basetime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, + p->tcfg_cycletime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, + p->tcfg_cycletime_ext, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) + goto nla_put_failure; + + entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); + if (!entry_list) + goto nla_put_failure; + + list_for_each_entry(entry, &p->entries, list) { + if (dumping_entry(skb, entry) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, entry_list); + + tcf_tm_dump(&t, &gact->tcf_tm); + if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) + goto nla_put_failure; + spin_unlock_bh(&gact->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&gact->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_gate_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_t *tm = &gact->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + +static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_idr_search(tn, a, index); +} + +static size_t tcf_gate_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_gate)); +} + +static struct tc_action_ops act_gate_ops = { + .kind = "gate", + .id = TCA_ID_GATE, + .owner = THIS_MODULE, + .act = tcf_gate_act, + .dump = tcf_gate_dump, + .init = tcf_gate_init, + .cleanup = tcf_gate_cleanup, + .walk = tcf_gate_walker, + .stats_update = tcf_gate_stats_update, + .get_fill_size = tcf_gate_get_fill_size, + .lookup = tcf_gate_search, + .size = sizeof(struct tcf_gate), +}; + +static __net_init int gate_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tc_action_net_init(net, tn, &act_gate_ops); +} + +static void __net_exit gate_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, gate_net_id); +} + +static struct pernet_operations gate_net_ops = { + .init = gate_init_net, + .exit_batch = gate_exit_net, + .id = &gate_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init gate_init_module(void) +{ + return tcf_register_action(&act_gate_ops, 
&gate_net_ops); +} + +static void __exit gate_cleanup_module(void) +{ + tcf_unregister_action(&act_gate_ops, &gate_net_ops); +} + +module_init(gate_init_module); +module_exit(gate_cleanup_module); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0a7ecc292bd3..a00a203b2ef5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,6 +39,7 @@ #include <net/tc_act/tc_skbedit.h> #include <net/tc_act/tc_ct.h> #include <net/tc_act/tc_mpls.h> +#include <net/tc_act/tc_gate.h> #include <net/flow_offload.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -620,96 +621,42 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command, bool ingress) -{ - struct flow_block_offload bo = { - .command = command, - .binder_type = ingress ? - FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : - FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, - .net = dev_net(dev), - .block_shared = tcf_block_non_null_shared(block), - }; - INIT_LIST_HEAD(&bo.cb_list); - - if (!block) - return; - - bo.block = &block->flow_block; - - down_write(&block->cb_lock); - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - tcf_block_setup(block, &bo); - up_write(&block->cb_lock); -} - -static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) +static void tcf_block_offload_init(struct flow_block_offload *bo, + struct net_device *dev, + enum flow_block_command command, + enum flow_block_binder_type binder_type, + struct flow_block *flow_block, + bool shared, struct netlink_ext_ack *extack) { - const struct Qdisc_class_ops *cops; - const struct Qdisc_ops *ops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - ops = qdisc->ops; - if (!ops) - return NULL; - - if (!ingress && !strcmp("ingress", ops->id)) - return NULL; - - cops = ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, - ingress ? 
TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, - NULL); + bo->net = dev_net(dev); + bo->command = command; + bo->binder_type = binder_type; + bo->block = flow_block; + bo->block_shared = shared; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); } -static void tc_indr_block_get_and_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) -{ - struct tcf_block *block; - - block = tc_dev_block(dev, true); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - - block = tc_dev_block(dev, false); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); -} +static void tcf_block_unbind(struct tcf_block *block, + struct flow_block_offload *bo); -static void tc_indr_block_call(struct tcf_block *block, - struct net_device *dev, - struct tcf_block_ext_info *ei, - enum flow_block_command command, - struct netlink_ext_ack *extack) +static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { - struct flow_block_offload bo = { - .command = command, - .binder_type = ei->binder_type, - .net = dev_net(dev), - .block = &block->flow_block, - .block_shared = tcf_block_shared(block), - .extack = extack, - }; - INIT_LIST_HEAD(&bo.cb_list); + struct tcf_block *block = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); - tcf_block_setup(block, &bo); + tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + block_cb->indr.binder_type, + &block->flow_block, tcf_block_shared(block), + &extack); + down_write(&block->cb_lock); + list_move(&block_cb->list, &bo.cb_list); + up_write(&block->cb_lock); + rtnl_lock(); + tcf_block_unbind(block, &bo); + rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) @@ -726,17 +673,21 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct flow_block_offload bo = {}; int err; - bo.net = dev_net(dev); - bo.command = command; - bo.binder_type = ei->binder_type; - bo.block = &block->flow_block; - bo.block_shared = tcf_block_shared(block); - bo.extack = extack; - INIT_LIST_HEAD(&bo.cb_list); + tcf_block_offload_init(&bo, dev, command, ei->binder_type, + &block->flow_block, tcf_block_shared(block), + extack); - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - if (err < 0) + if (dev->netdev_ops->ndo_setup_tc) + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + else + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, + &bo, tc_block_indr_cleanup); + + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; + } return tcf_block_setup(block, &bo); } @@ -749,13 +700,13 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_inc; /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. 
*/ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + if (dev->netdev_ops->ndo_setup_tc && + !tc_can_offload(dev) && + tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; @@ -767,18 +718,15 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, if (err) goto err_unlock; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); up_write(&block->cb_lock); return 0; no_offload_dev_inc: - if (tcf_block_offload_in_use(block)) { - err = -EOPNOTSUPP; + if (tcf_block_offload_in_use(block)) goto err_unlock; - } + err = 0; block->nooffloaddevcnt++; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); err_unlock: up_write(&block->cb_lock); return err; @@ -791,10 +739,6 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); - - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; @@ -1847,7 +1791,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool rtnl_held) + bool terse_dump, bool rtnl_held) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1874,6 +1818,14 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; + } else if (terse_dump) { + if (tp->ops->terse_dump) { + if (tp->ops->terse_dump(net, tp, fh, skb, tcm, + rtnl_held) < 0) + goto nla_put_failure; + } else { + goto cls_op_not_supp; + } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) @@ -1884,6 +1836,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, out_nlmsg_trim: nla_put_failure: +cls_op_not_supp: nlmsg_trim(skb, b); return -1; } @@ -1904,7 +1857,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1936,7 +1889,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -2497,6 +2450,7 @@ struct tcf_dump_args { struct tcf_block *block; struct Qdisc *q; u32 parent; + bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -2507,12 +2461,12 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true); + RTM_NEWTFILTER, a->terse_dump, true); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, - long index_start, long *p_index) + long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; @@ -2541,7 +2495,7 @@ static bool tcf_chain_dump(struct 
tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true) <= 0) + RTM_NEWTFILTER, false, true) <= 0) goto errout; cb->args[1] = 1; } @@ -2557,6 +2511,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; + arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; @@ -2570,6 +2525,10 @@ errout: return false; } +static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { + [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), +}; + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2579,6 +2538,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + bool terse_dump = false; long index_start; long index; u32 parent; @@ -2588,10 +2548,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, - NULL, cb->extack); + tcf_tfilter_dump_policy, cb->extack); if (err) return err; + if (tca[TCA_DUMP_FLAGS]) { + struct nla_bitfield32 flags = + nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); + + terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; + } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) @@ -2649,7 +2616,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, - index_start, &index)) { + index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; @@ -3152,7 +3119,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0, false) + < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { @@ -3176,6 +3144,31 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_exts_dump); +int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + struct nlattr *nest; + + if (!exts->action || !tcf_exts_has_actions(exts)) + return 0; + + nest = nla_nest_start_noflag(skb, exts->action); + if (!nest) + goto nla_put_failure; + + if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +#else + return 0; +#endif +} +EXPORT_SYMBOL(tcf_exts_terse_dump); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { @@ -3523,6 +3516,27 @@ static void tcf_sample_get_group(struct flow_action_entry *entry, #endif } +static void tcf_gate_entry_destructor(void *priv) +{ + struct action_gate_entry *oe = priv; + + kfree(oe); +} + +static int tcf_gate_get_entries(struct flow_action_entry *entry, + const struct tc_action *act) +{ + entry->gate.entries = tcf_gate_get_list(act); + + if (!entry->gate.entries) + return -EINVAL; + + entry->destructor = tcf_gate_entry_destructor; + entry->destructor_priv = entry->gate.entries; + + return 0; +} + static enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) { if 
(WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY)) @@ -3679,6 +3693,17 @@ int tc_setup_flow_action(struct flow_action *flow_action, } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); + } else if (is_tcf_gate(act)) { + entry->id = FLOW_ACTION_GATE; + entry->gate.index = tcf_gate_index(act); + entry->gate.prio = tcf_gate_prio(act); + entry->gate.basetime = tcf_gate_basetime(act); + entry->gate.cycletime = tcf_gate_cycletime(act); + entry->gate.cycletimeext = tcf_gate_cycletimeext(act); + entry->gate.num_entries = tcf_gate_num_entries(act); + err = tcf_gate_get_entries(entry, act); + if (err) + goto err_out; } else { err = -EOPNOTSUPP; goto err_out_locked; @@ -3739,11 +3764,6 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_entry block_entry = { - .cb = tc_indr_block_get_and_cmd, - .list = LIST_HEAD_INIT(block_entry.list), -}; - static int __init tc_filter_init(void) { int err; @@ -3756,8 +3776,6 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_cb(&block_entry); - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 74a0febcafb8..b2da37286082 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -272,14 +272,16 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, return NULL; } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey, - struct fl_flow_key *key) +static noinline_for_stack +struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key) { + struct fl_flow_key mkey; + + fl_set_masked_key(&mkey, key, mask); if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) - return fl_lookup_range(mask, mkey, key); + return fl_lookup_range(mask, &mkey, key); - return __fl_lookup(mask, mkey); + return __fl_lookup(mask, &mkey); } static u16 fl_ct_info_to_flower_map[] = { @@ -299,7 +301,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct fl_flow_key skb_mkey; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -319,9 +320,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, mask); - - f = fl_lookup(mask, &skb_mkey, &skb_key); + f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -668,6 +667,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, @@ -726,6 +726,15 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static const struct nla_policy +mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { + 
[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -776,14 +785,157 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_mpls_lse(const struct nlattr *nla_lse, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1]; + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + u8 lse_index; + u8 depth; + int err; + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse, + mpls_stack_entry_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) { + NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\""); + return -EINVAL; + } + + depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]); + + /* LSE depth starts at 1, for consistency with terminology used by + * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets. + */ + if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH], + "Invalid MPLS depth"); + return -EINVAL; + } + lse_index = depth - 1; + + dissector_set_mpls_lse(key_val, lse_index); + dissector_set_mpls_lse(key_mask, lse_index); + + lse_val = &key_val->ls[lse_index]; + lse_mask = &key_mask->ls[lse_index]; + + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) { + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) { + u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]); + + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); + return -EINVAL; + } + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) { + u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]); + + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC], + "Traffic Class (TC) must be between 0 and 7"); + return -EINVAL; + } + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) { + u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]); + + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL], + "Label must be between 0 and 1048575"); + return -EINVAL; + } + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + } + + return 0; +} + +static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_lse; + int rem; + int err; + + if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts, + "NLA_F_NESTED is missing"); + return -EINVAL; + } + + nla_for_each_nested(nla_lse, nla_mpls_opts, rem) { + if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) { + NL_SET_ERR_MSG_ATTR(extack, nla_lse, + "Invalid MPLS option type"); + return -EINVAL; + } + + err = 
fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack); + if (err < 0) + return err; + } + if (rem) { + NL_SET_ERR_MSG(extack, + "Bytes leftover after parsing MPLS options"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask, struct netlink_ext_ack *extack) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + + if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) { + if (tb[TCA_FLOWER_KEY_MPLS_TTL] || + tb[TCA_FLOWER_KEY_MPLS_BOS] || + tb[TCA_FLOWER_KEY_MPLS_TC] || + tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPTS], + "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute"); + return -EBADMSG; + } + + return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS], + key_val, key_mask, extack); + } + + lse_val = &key_val->ls[0]; + lse_mask = &key_mask->ls[0]; + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { - key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); - key_mask->mpls_ttl = MPLS_TTL_MASK; + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); @@ -794,8 +946,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; } - key_val->mpls_bos = bos; - key_mask->mpls_bos = MPLS_BOS_MASK; + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); @@ -806,8 +960,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; } - key_val->mpls_tc = tc; - key_mask->mpls_tc = MPLS_TC_MASK; + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); @@ -818,8 +974,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Label must be between 0 and 1048575"); return -EINVAL; } - key_val->mpls_label = label; - key_mask->mpls_label = MPLS_LABEL_MASK; + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } return 0; } @@ -2218,35 +2376,132 @@ static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, return 0; } +static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask, + u8 lse_index) +{ + struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index]; + struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index]; + int err; + + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + lse_index + 1); + if (err) + return err; + + if (lse_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + lse_key->mpls_ttl); + if (err) + return err; + } + if (lse_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + lse_key->mpls_bos); + if (err) + return err; + } + if (lse_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + lse_key->mpls_tc); + if (err) + return 
err; + } + if (lse_mask->mpls_label) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); + if (err) + return err; + } + + return 0; +} + +static int fl_dump_key_mpls_opts(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + struct nlattr *opts; + struct nlattr *lse; + u8 lse_index; + int err; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS); + if (!opts) + return -EMSGSIZE; + + for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) { + if (!(mpls_mask->used_lses & 1 << lse_index)) + continue; + + lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE); + if (!lse) { + err = -EMSGSIZE; + goto err_opts; + } + + err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask, + lse_index); + if (err) + goto err_opts_lse; + nla_nest_end(skb, lse); + } + nla_nest_end(skb, opts); + + return 0; + +err_opts_lse: + nla_nest_cancel(skb, lse); +err_opts: + nla_nest_cancel(skb, opts); + + return err; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_key; int err; - if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + if (!mpls_mask->used_lses) return 0; - if (mpls_mask->mpls_ttl) { + + lse_mask = &mpls_mask->ls[0]; + lse_key = &mpls_key->ls[0]; + + /* For backward compatibility, don't use the MPLS nested attributes if + * the rule can be expressed using the old attributes. + */ + if (mpls_mask->used_lses & ~1 || + (!lse_mask->mpls_ttl && !lse_mask->mpls_bos && + !lse_mask->mpls_tc && !lse_mask->mpls_label)) + return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask); + + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, - mpls_key->mpls_ttl); + lse_key->mpls_ttl); if (err) return err; } - if (mpls_mask->mpls_tc) { + if (lse_mask->mpls_tc) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, - mpls_key->mpls_tc); + lse_key->mpls_tc); if (err) return err; } - if (mpls_mask->mpls_label) { + if (lse_mask->mpls_label) { err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, - mpls_key->mpls_label); + lse_key->mpls_label); if (err) return err; } - if (mpls_mask->mpls_bos) { + if (lse_mask->mpls_bos) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, - mpls_key->mpls_bos); + lse_key->mpls_bos); if (err) return err; } @@ -2768,6 +3023,48 @@ nla_put_failure: return -1; } +static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + bool skip_hw; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + spin_lock(&tp->lock); + + skip_hw = tc_skip_hw(f->flags); + + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) + goto nla_put_failure_locked; + + spin_unlock(&tp->lock); + + if (!skip_hw) + fl_hw_update_stats(tp, f, rtnl_held); + + if (tcf_exts_terse_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure_locked: + spin_unlock(&tp->lock); +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; @@ -2832,6 +3129,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .hw_add = fl_hw_add, .hw_del = 
fl_hw_del, .dump = fl_dump, + .terse_dump = fl_terse_dump, .bind_class = fl_bind_class, .tmplt_create = fl_tmplt_create, .tmplt_destroy = fl_tmplt_destroy, diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index eecfe072c508..18755d29fd15 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -199,7 +199,7 @@ static void em_ipt_destroy(struct tcf_ematch *em) im->match->destroy(&par); } module_put(im->match->me); - kfree((void *)im); + kfree(im); } static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0d99df1e764d..9a3449b56bd6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -32,6 +32,8 @@ #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <trace/events/qdisc.h> + /* Short review. @@ -1283,6 +1285,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } qdisc_hash_add(sch, false); + trace_qdisc_create(ops, dev, parent); return sch; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 1496e87cd07b..60f8ae578819 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -584,26 +584,48 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } -static void cake_update_flowkeys(struct flow_keys *keys, +static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; - bool rev = !skb->_nfct; + bool rev = !skb->_nfct, upd = false; + __be32 ip; if (tc_skb_protocol(skb) != htons(ETH_P_IP)) - return; + return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) - return; + return false; - keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; - keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + if (ip != keys->addrs.v4addrs.src) { + keys->addrs.v4addrs.src = ip; + upd = true; + } + ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + if (ip != keys->addrs.v4addrs.dst) { + keys->addrs.v4addrs.dst = ip; + upd = true; + } if (keys->ports.ports) { - keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; - keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + __be16 port; + + port = rev ? tuple.dst.u.all : tuple.src.u.all; + if (port != keys->ports.src) { + keys->ports.src = port; + upd = true; + } + port = rev ? tuple.src.u.all : tuple.dst.u.all; + if (port != keys->ports.dst) { + port = keys->ports.dst; + upd = true; + } } + return upd; +#else + return false; #endif } @@ -624,23 +646,36 @@ static bool cake_ddst(int flow_mode) static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { + bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); + bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); + bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; + bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; - /* If both overrides are set we can skip packet dissection entirely */ - if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && - (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) + /* If both overrides are set, or we can use the SKB hash and nat mode is + * disabled, we can skip packet dissection entirely. If nat mode is + * enabled there's another check below after doing the conntrack lookup. 
+ */ + if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (flow_mode & CAKE_FLOW_NAT_FLAG) - cake_update_flowkeys(&keys, skb); + /* Don't use the SKB hash if we change the lookup keys from conntrack */ + if (nat_enabled && cake_update_flowkeys(&keys, skb)) + use_skbhash = false; + + /* If we can still use the SKB hash and don't need the host hash, we can + * skip the rest of the hashing procedure + */ + if (use_skbhash && !hash_hosts) + goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat @@ -679,12 +714,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ - if (flow_mode & CAKE_FLOW_FLOWS) + if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; + else if (use_skbhash) + flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 1bcf8fbfd40e..bd618b00d319 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -131,7 +131,6 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, } struct choke_skb_cb { - u16 classid; u8 keys_valid; struct flow_keys_digest keys; }; @@ -142,11 +141,6 @@ static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } -static inline void choke_set_classid(struct sk_buff *skb, u16 classid) -{ - choke_skb_cb(skb)->classid = classid; -} - /* * Compare flow of two packets * Returns true only if source and destination address and port match. diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4c060134c736..8f06a808c59a 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -66,22 +66,27 @@ static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) * in linear list (head,tail), otherwise are placed in a rbtree (t_root). 
*/ struct fq_flow { +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ struct rb_root t_root; struct sk_buff *head; /* list of skbs for this flow : first skb */ union { struct sk_buff *tail; /* last skb in the list */ - unsigned long age; /* jiffies when flow was emptied, for gc */ + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; + u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ + +/* Second cache line, used in fq_dequeue() */ int credit; - u32 socket_hash; /* sk_hash */ - struct fq_flow *next; /* next pointer in RR lists, or &detached */ + /* 32bit hole on 64bit arches */ + + struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; -}; +} ____cacheline_aligned_in_smp; struct fq_flow_head { struct fq_flow *first; @@ -95,6 +100,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + u64 ktime_cache; /* copy of last ktime_get_ns() */ unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ @@ -104,12 +110,13 @@ struct fq_sched_data { u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ u64 ce_threshold; + u64 horizon; /* horizon in ns */ u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; - + u8 horizon_drop; u32 flows; u32 inactive_flows; u32 throttled_flows; @@ -118,6 +125,8 @@ struct fq_sched_data { u64 stat_internal_packets; u64 stat_throttled; u64 stat_ce_mark; + u64 stat_horizon_drops; + u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -126,20 +135,25 @@ struct fq_sched_data { struct qdisc_watchdog watchdog; }; -/* special value to mark a detached flow (not on old/new list) */ -static struct fq_flow detached, throttled; - +/* + * f->tail and f->age share the same location. + * We can use the low order bit to differentiate if this location points + * to a sk_buff or contains a jiffies value, if we force this value to be odd. 
+ * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2 + */ static void fq_flow_set_detached(struct fq_flow *f) { - f->next = &detached; - f->age = jiffies; + f->age = jiffies | 1UL; } static bool fq_flow_is_detached(const struct fq_flow *f) { - return f->next == &detached; + return !!(f->age & 1UL); } +/* special value to mark a throttled flow (not on old/new list) */ +static struct fq_flow throttled; + static bool fq_flow_is_throttled(const struct fq_flow *f) { return f->next == &throttled; @@ -204,9 +218,10 @@ static void fq_gc(struct fq_sched_data *q, struct rb_root *root, struct sock *sk) { - struct fq_flow *f, *tofree[FQ_GC_MAX]; struct rb_node **p, *parent; - int fcnt = 0; + void *tofree[FQ_GC_MAX]; + struct fq_flow *f; + int i, fcnt = 0; p = &root->rb_node; parent = NULL; @@ -229,15 +244,18 @@ static void fq_gc(struct fq_sched_data *q, p = &parent->rb_left; } + if (!fcnt) + return; + + for (i = fcnt; i > 0; ) { + f = tofree[--i]; + rb_erase(&f->fq_node, root); + } q->flows -= fcnt; q->inactive_flows -= fcnt; q->stat_gc_flows += fcnt; - while (fcnt) { - struct fq_flow *f = tofree[--fcnt]; - rb_erase(&f->fq_node, root); - kmem_cache_free(fq_flow_cachep, f); - } + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) @@ -370,19 +388,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, } } -/* remove one skb from head of flow queue */ -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +/* Remove one skb from flow queue. + * This skb must be the return value of prior fq_peek(). + */ +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, + struct sk_buff *skb) { - struct sk_buff *skb = fq_peek(flow); - - if (skb) { - fq_erase_head(sch, flow, skb); - skb_mark_not_on_list(skb); - flow->qlen--; - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - } - return skb; + fq_erase_head(sch, flow, skb); + skb_mark_not_on_list(skb); + flow->qlen--; + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -390,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) struct rb_node **p, *parent; struct sk_buff *head, *aux; - fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); - head = flow->head; if (!head || fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { @@ -419,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) rb_insert_color(&skb->rbnode, &flow->t_root); } +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, + const struct fq_sched_data *q) +{ + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); +} + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -428,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + if (!skb->tstamp) { + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + } else { + /* Check if packet timestamp is too far in the future. + * Try first if our cached value, to avoid ktime_get_ns() + * cost in most cases. 
@@ -370,19 +388,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 	}
 }
 
-/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+/* Remove one skb from flow queue.
+ * This skb must be the return value of prior fq_peek().
+ */
+static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
+			   struct sk_buff *skb)
 {
-	struct sk_buff *skb = fq_peek(flow);
-
-	if (skb) {
-		fq_erase_head(sch, flow, skb);
-		skb_mark_not_on_list(skb);
-		flow->qlen--;
-		qdisc_qstats_backlog_dec(sch, skb);
-		sch->q.qlen--;
-	}
-	return skb;
+	fq_erase_head(sch, flow, skb);
+	skb_mark_not_on_list(skb);
+	flow->qlen--;
+	qdisc_qstats_backlog_dec(sch, skb);
+	sch->q.qlen--;
 }
 
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
@@ -390,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 	struct rb_node **p, *parent;
 	struct sk_buff *head, *aux;
 
-	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
-
 	head = flow->head;
 	if (!head ||
 	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
@@ -419,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 	rb_insert_color(&skb->rbnode, &flow->t_root);
 }
 
+static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
+				     const struct fq_sched_data *q)
+{
+	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
+}
+
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		      struct sk_buff **to_free)
 {
@@ -428,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(sch->q.qlen >= sch->limit))
 		return qdisc_drop(skb, sch, to_free);
 
+	if (!skb->tstamp) {
+		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+	} else {
+		/* Check if packet timestamp is too far in the future.
+		 * Test first against our cached value, to avoid the
+		 * ktime_get_ns() cost in most cases.
+		 */
+		if (fq_packet_beyond_horizon(skb, q)) {
+			/* Refresh our cache and check another time */
+			q->ktime_cache = ktime_get_ns();
+			if (fq_packet_beyond_horizon(skb, q)) {
+				if (q->horizon_drop) {
+					q->stat_horizon_drops++;
+					return qdisc_drop(skb, sch, to_free);
+				}
+				q->stat_horizon_caps++;
+				skb->tstamp = q->ktime_cache + q->horizon;
+			}
+		}
+		fq_skb_cb(skb)->time_to_send = skb->tstamp;
+	}
+
 	f = fq_classify(skb, q);
 	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
 		q->stat_flows_plimit++;
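The enqueue path above avoids a clock read per packet: q->ktime_cache can only lag real time, so when the cached comparison says a timestamp is within the horizon, that answer is definitely correct; only a cached "beyond horizon" verdict needs to be re-checked against a fresh ktime_get_ns(). A userspace sketch of this check-cheap-then-refresh pattern, with invented names:

#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Returns 1 if tstamp is more than horizon ns in the future.
 * clock_cache is refreshed only when the stale value already
 * claims the timestamp is out of range.
 */
static int beyond_horizon(uint64_t tstamp, uint64_t *clock_cache,
			  uint64_t horizon)
{
	if ((int64_t)(tstamp - (*clock_cache + horizon)) <= 0)
		return 0;		/* cheap path: cache says in range */
	*clock_cache = now_ns();	/* refresh and check another time */
	return (int64_t)(tstamp - (*clock_cache + horizon)) > 0;
}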
@@ -494,11 +536,13 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	if (!sch->q.qlen)
 		return NULL;
 
-	skb = fq_dequeue_head(sch, &q->internal);
-	if (skb)
+	skb = fq_peek(&q->internal);
+	if (unlikely(skb)) {
+		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
+	}
 
-	now = ktime_get_ns();
+	q->ktime_cache = now = ktime_get_ns();
 	fq_check_throttled(q, now);
 begin:
 	head = &q->new_flows;
@@ -532,14 +576,13 @@ begin:
 			fq_flow_set_throttled(q, f);
 			goto begin;
 		}
+		prefetch(&skb->end);
 		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
-	}
-
-	skb = fq_dequeue_head(sch, f);
-	if (!skb) {
+		fq_dequeue_skb(sch, f, skb);
+	} else {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
 		if ((head == &q->new_flows) && q->old_flows.first) {
@@ -550,7 +593,6 @@ begin:
 		}
 		goto begin;
 	}
-	prefetch(&skb->end);
 	plen = qdisc_pkt_len(skb);
 	f->credit -= plen;
@@ -753,6 +795,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
 	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
 	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
+	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
+	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -842,7 +886,15 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_FQ_TIMER_SLACK])
 		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
 
+	if (tb[TCA_FQ_HORIZON])
+		q->horizon = (u64)NSEC_PER_USEC *
+				  nla_get_u32(tb[TCA_FQ_HORIZON]);
+
+	if (tb[TCA_FQ_HORIZON_DROP])
+		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
+
 	if (!err) {
+
 		sch_tree_unlock(sch);
 		err = fq_resize(sch, fq_log);
 		sch_tree_lock(sch);
@@ -895,6 +947,9 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
 
+	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
+	q->horizon_drop = 1; /* by default, drop packets beyond horizon */
+
 	/* Default ce_threshold of 4294 seconds */
 	q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
 
@@ -912,6 +967,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
 	u64 ce_threshold = q->ce_threshold;
+	u64 horizon = q->horizon;
 	struct nlattr *opts;
 
 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
@@ -921,6 +977,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
 
 	do_div(ce_threshold, NSEC_PER_USEC);
+	do_div(horizon, NSEC_PER_USEC);
 
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
@@ -936,7 +993,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 			q->low_rate_threshold) ||
 	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
-	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
+	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
+	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
+	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
 		goto nla_put_failure;
 
 	return nla_nest_end(skb, opts);
@@ -967,6 +1026,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.unthrottle_latency_ns = min_t(unsigned long,
 					 q->unthrottle_latency_ns, ~0U);
 	st.ce_mark = q->stat_ce_mark;
+	st.horizon_drops = q->stat_horizon_drops;
+	st.horizon_caps = q->stat_horizon_caps;
 	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2efd5b61acef..b19a0021a0bd 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 };
 EXPORT_SYMBOL(pfifo_fast_ops);
 
+static struct lock_class_key qdisc_tx_busylock;
+static struct lock_class_key qdisc_running_key;
+
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops,
 			  struct netlink_ext_ack *extack)
@@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	}
 
 	spin_lock_init(&sch->busylock);
+	lockdep_set_class(&sch->busylock,
+			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
 	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
 	spin_lock_init(&sch->seqlock);
+	lockdep_set_class(&sch->seqlock,
+			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
 	seqcount_init(&sch->running);
+	lockdep_set_class(&sch->running,
+			  dev->qdisc_running_key ?: &qdisc_running_key);
 
 	sch->ops = ops;
 	sch->flags = ops->static_flags;
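The two static keys above give every qdisc a shared default lock class; the dev->qdisc_tx_busylock and dev->qdisc_running_key pointers let a driver that stacks one TX path on top of another substitute per-driver keys, so lockdep does not mistake the nesting for recursion. A minimal sketch of a hypothetical stacked driver opting in via the stock netdev_lockdep_set_classes() helper:

#include <linux/netdevice.h>

/* hypothetical setup callback for a device that transmits
 * through a lower device (tunnel/bonding style)
 */
static void stacked_dev_setup(struct net_device *dev)
{
	/* expands to static lock_class_key objects and points
	 * dev->qdisc_tx_busylock / dev->qdisc_running_key at them
	 */
	netdev_lockdep_set_classes(dev);
}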
@@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	dev_hold(dev);
 	refcount_set(&sch->refcnt, 1);
 
-	if (sch != &noop_qdisc) {
-		lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key);
-		lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key);
-		lockdep_set_class(&sch->running, &dev->qdisc_running_key);
-	}
-
 	return sch;
 errout1:
 	kfree(p);
@@ -891,8 +896,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 	}
 	sch->parent = parentid;
 
-	if (!ops->init || ops->init(sch, NULL, extack) == 0)
+	if (!ops->init || ops->init(sch, NULL, extack) == 0) {
+		trace_qdisc_create(ops, dev_queue->dev, parentid);
 		return sch;
+	}
 
 	qdisc_put(sch);
 	return NULL;
@@ -906,6 +913,8 @@ void qdisc_reset(struct Qdisc *qdisc)
 	const struct Qdisc_ops *ops = qdisc->ops;
 	struct sk_buff *skb, *tmp;
 
+	trace_qdisc_reset(qdisc);
+
 	if (ops->reset)
 		ops->reset(qdisc);
@@ -944,7 +953,6 @@ static void qdisc_free_cb(struct rcu_head *head)
 static void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops *ops = qdisc->ops;
-	struct sk_buff *skb, *tmp;
 
 #ifdef CONFIG_NET_SCHED
 	qdisc_hash_del(qdisc);
@@ -952,23 +960,16 @@ static void qdisc_destroy(struct Qdisc *qdisc)
 	qdisc_put_stab(rtnl_dereference(qdisc->stab));
 #endif
 	gen_kill_estimator(&qdisc->rate_est);
-	if (ops->reset)
-		ops->reset(qdisc);
+
+	qdisc_reset(qdisc);
+
 	if (ops->destroy)
 		ops->destroy(qdisc);
 
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
 
-	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
-		__skb_unlink(skb, &qdisc->gso_skb);
-		kfree_skb_list(skb);
-	}
-
-	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
-		__skb_unlink(skb, &qdisc->skb_bad_txq);
-		kfree_skb_list(skb);
-	}
+	trace_qdisc_destroy(qdisc);
 
 	call_rcu(&qdisc->rcu, qdisc_free_cb);
 }
@@ -1037,10 +1038,9 @@ static void attach_one_default_qdisc(struct net_device *dev,
 		ops = &pfifo_fast_ops;
 
 	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
-	if (!qdisc) {
-		netdev_info(dev, "activation failed\n");
+	if (!qdisc)
 		return;
-	}
+
 	if (!netif_is_multiqueue(dev))
 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	dev_queue->qdisc_sleeping = qdisc;
@@ -1065,6 +1065,18 @@ static void attach_default_qdiscs(struct net_device *dev)
 			qdisc->ops->attach(qdisc);
 		}
 	}
+
+	/* Detect default qdisc setup/init failed and fallback to "noqueue" */
+	if (dev->qdisc == &noop_qdisc) {
+		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
+			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
+		dev->priv_flags |= IFF_NO_QUEUE;
+		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+		dev->qdisc = txq->qdisc_sleeping;
+		qdisc_refcount_inc(dev->qdisc);
+		dev->priv_flags ^= IFF_NO_QUEUE;
+	}
+
 #ifdef CONFIG_NET_SCHED
 	if (dev->qdisc != &noop_qdisc)
 		qdisc_hash_add(dev->qdisc, false);
@@ -1116,6 +1128,28 @@ void dev_activate(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_activate);
 
+static void qdisc_deactivate(struct Qdisc *qdisc)
+{
+	bool nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+	if (qdisc->flags & TCQ_F_BUILTIN)
+		return;
+	if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state))
+		return;
+
+	if (nolock)
+		spin_lock_bh(&qdisc->seqlock);
+	spin_lock_bh(qdisc_lock(qdisc));
+
+	set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+
+	qdisc_reset(qdisc);
+
+	spin_unlock_bh(qdisc_lock(qdisc));
+	if (nolock)
+		spin_unlock_bh(&qdisc->seqlock);
+}
+
 static void dev_deactivate_queue(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
 				 void *_qdisc_default)
@@ -1125,21 +1159,8 @@ static void dev_deactivate_queue(struct net_device *dev,
 	qdisc = rtnl_dereference(dev_queue->qdisc);
 	if (qdisc) {
-		bool nolock = qdisc->flags & TCQ_F_NOLOCK;
-
-		if (nolock)
-			spin_lock_bh(&qdisc->seqlock);
-		spin_lock_bh(qdisc_lock(qdisc));
-
-		if (!(qdisc->flags & TCQ_F_BUILTIN))
-			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
-
+		qdisc_deactivate(qdisc);
 		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
-		qdisc_reset(qdisc);
-
-		spin_unlock_bh(qdisc_lock(qdisc));
-		if (nolock)
-			spin_unlock_bh(&qdisc->seqlock);
 	}
 }
@@ -1170,16 +1191,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 	return false;
 }
 
-static void dev_qdisc_reset(struct net_device *dev,
-			    struct netdev_queue *dev_queue,
-			    void *none)
-{
-	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
-
-	if (qdisc)
-		qdisc_reset(qdisc);
-}
-
 /**
  * dev_deactivate_many - deactivate transmissions on several devices
  * @head: list of devices to deactivate
@@ -1216,12 +1227,6 @@ void dev_deactivate_many(struct list_head *head)
 			 */
 			schedule_timeout_uninterruptible(1);
 		}
-		/* The new qdisc is assigned at this point so we can safely
-		 * unwind stale skb lists and qdisc statistics
-		 */
-		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
-		if (dev_ingress_queue(dev))
-			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
 	}
 }
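qdisc_deactivate() above hoists the TCQ_F_BUILTIN and already-deactivated checks in front of the lock/mark/reset sequence, so calling it twice is harmless and builtin qdiscs are never touched. The same guard shape in a self-contained userspace sketch (invented types, pthread locking standing in for the qdisc locks, the mutex assumed initialized elsewhere):

#include <pthread.h>
#include <stdatomic.h>

struct worker {
	pthread_mutex_t lock;
	atomic_int deactivated;
	int queue_len;
};

static void worker_deactivate(struct worker *w)
{
	if (atomic_load(&w->deactivated))
		return;				/* idempotent: already torn down */

	pthread_mutex_lock(&w->lock);
	atomic_store(&w->deactivated, 1);	/* producers now refuse new work */
	w->queue_len = 0;			/* "reset" with the lock held */
	pthread_mutex_unlock(&w->lock);
}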
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index c7de47c942e3..555a1b9e467f 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -48,7 +48,7 @@ struct red_sched_data {
 	struct Qdisc	*qdisc;
 };
 
-static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP;
+#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP)
 
 static inline int red_use_ecn(struct red_sched_data *q)
 {
@@ -212,8 +212,7 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
 	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
 	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
 	[TCA_RED_MAX_P] = { .type = NLA_U32 },
-	[TCA_RED_FLAGS] = { .type = NLA_BITFIELD32,
-			    .validation_data = &red_supported_flags },
+	[TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS),
 };
 
 static int red_change(struct Qdisc *sch, struct nlattr *opt,
@@ -248,7 +247,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 
 	err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS,
-			    tb[TCA_RED_FLAGS], red_supported_flags,
+			    tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS,
 			    &flags_bf, &userbits, extack);
 	if (err)
 		return err;
@@ -372,7 +371,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
 	    nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) ||
 	    nla_put_bitfield32(skb, TCA_RED_FLAGS,
-			       q->flags, red_supported_flags))
+			       q->flags, TC_RED_SUPPORTED_FLAGS))
 		goto nla_put_failure;
 
 	return nla_nest_end(skb, opts);
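For reference, NLA_POLICY_BITFIELD32() as adopted above moves flag validation into the netlink core: unknown bits are rejected before the qdisc ever sees them, and nla_get_bitfield32() then yields both the value and the selector mask of bits the sender actually expressed. A minimal sketch with hypothetical attribute and flag names:

#include <linux/bits.h>
#include <net/netlink.h>

#define MY_FLAG_A	BIT(0)
#define MY_FLAG_B	BIT(1)
#define MY_SUPPORTED_FLAGS (MY_FLAG_A | MY_FLAG_B)

enum {
	MY_ATTR_UNSPEC,
	MY_ATTR_FLAGS,
	__MY_ATTR_MAX,
};
#define MY_ATTR_MAX (__MY_ATTR_MAX - 1)

static const struct nla_policy my_policy[MY_ATTR_MAX + 1] = {
	/* core validation rejects any bit outside MY_SUPPORTED_FLAGS */
	[MY_ATTR_FLAGS] = NLA_POLICY_BITFIELD32(MY_SUPPORTED_FLAGS),
};

static u32 my_get_flags(const struct nlattr *attr)
{
	struct nla_bitfield32 bf = nla_get_bitfield32(attr);

	/* only bits named in bf.selector were set or cleared by the sender */
	return bf.value & bf.selector;
}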