Diffstat (limited to 'net/sched')
-rw-r--r--   net/sched/Kconfig         |  30
-rw-r--r--   net/sched/Makefile        |   2
-rw-r--r--   net/sched/act_ctinfo.c    |  11
-rw-r--r--   net/sched/act_ife.c       |   7
-rw-r--r--   net/sched/cls_api.c       |   5
-rw-r--r--   net/sched/cls_basic.c     |  11
-rw-r--r--   net/sched/cls_bpf.c       |  11
-rw-r--r--   net/sched/cls_flower.c    |  11
-rw-r--r--   net/sched/cls_fw.c        |  11
-rw-r--r--   net/sched/cls_matchall.c  |  11
-rw-r--r--   net/sched/cls_route.c     |  11
-rw-r--r--   net/sched/cls_rsvp.h      |  11
-rw-r--r--   net/sched/cls_tcindex.c   |  11
-rw-r--r--   net/sched/cls_u32.c       |  11
-rw-r--r--   net/sched/ematch.c        |   5
-rw-r--r--   net/sched/sch_api.c       |  47
-rw-r--r--   net/sched/sch_cake.c      |  65
-rw-r--r--   net/sched/sch_choke.c     |   2
-rw-r--r--   net/sched/sch_ets.c       | 828
-rw-r--r--   net/sched/sch_fq.c        |   6
-rw-r--r--   net/sched/sch_fq_pie.c    | 562
-rw-r--r--   net/sched/sch_generic.c   |   2
-rw-r--r--   net/sched/sch_pie.c       | 289
-rw-r--r--   net/sched/sch_prio.c      |  10
-rw-r--r--   net/sched/sch_tbf.c       |  60
25 files changed, 1742 insertions, 288 deletions
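
The largest additions below are two new queueing disciplines, sch_ets.c and sch_fq_pie.c. One piece of arithmetic in the sch_ets.c hunk that is easy to misread is how ets_offload_change() turns per-band DRR quanta into the cumulative percentage weights handed to drivers. The standalone userspace sketch below reproduces that calculation; the three quantum values are invented for illustration and are not taken from the patch.

#include <stdio.h>

/* Illustrative sketch of the quantum-to-weight conversion performed by
 * ets_offload_change() in the sch_ets.c hunk below.  The kernel code walks
 * q->classes[i].quantum; the quanta used here are made-up example values.
 */
int main(void)
{
	unsigned int quanta[] = { 1000, 2000, 3000 };	/* example DRR quanta */
	unsigned int nbands = 3;
	unsigned int q_sum = 0, q_psum = 0, w_psum_prev = 0;
	unsigned int i;

	for (i = 0; i < nbands; i++)
		q_sum += quanta[i];

	for (i = 0; i < nbands; i++) {
		unsigned int w_psum, weight;

		q_psum += quanta[i];
		/* cumulative share in percent, then the per-band difference */
		w_psum = quanta[i] ? q_psum * 100 / q_sum : 0;
		weight = w_psum - w_psum_prev;
		w_psum_prev = w_psum;
		printf("band %u: quantum %u -> weight %u%%\n",
		       i, quanta[i], weight);
	}
	return 0;
}

With quanta of 1000/2000/3000 this prints weights of 16%, 34% and 50%. Because the weights are differences of a cumulative percentage, they always telescope to 100 even though each individual division rounds down.
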
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2985509147a2..edde0e519438 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -366,6 +366,19 @@ config NET_SCH_PIE If unsure, say N. +config NET_SCH_FQ_PIE + depends on NET_SCH_PIE + tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)" + help + Say Y here if you want to use the Flow Queue Proportional Integral + controller Enhanced (FQ-PIE) packet scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc8033 + + To compile this driver as a module, choose M here: the module + will be called sch_fq_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT @@ -409,6 +422,23 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. +config NET_SCH_ETS + tristate "Enhanced transmission selection scheduler (ETS)" + help + The Enhanced Transmission Selection scheduler is a classful + queuing discipline that merges functionality of PRIO and DRR + qdiscs in one scheduler. ETS makes it easy to configure a set of + strict and bandwidth-sharing bands to implement the transmission + selection described in 802.1Qaz. + + Say Y here if you want to use the ETS packet scheduling + algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_ets. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 415d1e1f237e..31c367a6cd09 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o +obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o @@ -58,6 +59,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 40038c321b4a..19649623493b 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -360,6 +360,16 @@ static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } +static void tcf_ctinfo_cleanup(struct tc_action *a) +{ + struct tcf_ctinfo *ci = to_ctinfo(a); + struct tcf_ctinfo_params *cp; + + cp = rcu_dereference_protected(ci->params, 1); + if (cp) + kfree_rcu(cp, rcu); +} + static struct tc_action_ops act_ctinfo_ops = { .kind = "ctinfo", .id = TCA_ID_CTINFO, @@ -367,6 +377,7 @@ static struct tc_action_ops act_ctinfo_ops = { .act = tcf_ctinfo_act, .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, + .cleanup= tcf_ctinfo_cleanup, .walk = tcf_ctinfo_walker, .lookup = tcf_ctinfo_search, .size = sizeof(struct tcf_ctinfo), diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5e6379028fc3..c1fcd85719d6 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -537,6 +537,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&ife->metalist); + err = 
tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; @@ -566,10 +569,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, p->eth_type = ife_type; } - - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&ife->metalist); - if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 76e0d122616a..c2cdd0fc2e70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2055,9 +2055,8 @@ replay: &chain_info)); mutex_unlock(&chain->filter_chain_lock); - tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, chain, rtnl_held, - extack); + tp_new = tcf_proto_create(name, protocol, prio, chain, + rtnl_held, extack); if (IS_ERR(tp_new)) { err = PTR_ERR(tp_new); goto errout_tp; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 4aafbe3d435c..f256a7c69093 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -263,12 +263,17 @@ skip: } } -static void basic_bind_class(void *fh, u32 classid, unsigned long cl) +static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct basic_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 8229ed4a67be..6e3e63db0e01 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -631,12 +631,17 @@ nla_put_failure: return -1; } -static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl) +static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct cls_bpf_prog *prog = fh; - if (prog && prog->res.classid == classid) - prog->res.class = cl; + if (prog && prog->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &prog->res, base); + else + __tcf_unbind_filter(q, &prog->res); + } } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b0f42e62dd76..f9c0d1e8d380 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2765,12 +2765,17 @@ nla_put_failure: return -EMSGSIZE; } -static void fl_bind_class(void *fh, u32 classid, unsigned long cl) +static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_fl_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static bool fl_delete_empty(struct tcf_proto *tp) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index c9496c920d6f..ec945294626a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -419,12 +419,17 @@ nla_put_failure: return -1; } -static void fw_bind_class(void *fh, u32 classid, unsigned long cl) +static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct fw_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_fw_ops __read_mostly = { diff --git a/net/sched/cls_matchall.c 
b/net/sched/cls_matchall.c index 7fc2eb62aa98..039cc86974f4 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -393,12 +393,17 @@ nla_put_failure: return -1; } -static void mall_bind_class(void *fh, u32 classid, unsigned long cl) +static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_mall_head *head = fh; - if (head && head->res.classid == classid) - head->res.class = cl; + if (head && head->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &head->res, base); + else + __tcf_unbind_filter(q, &head->res); + } } static struct tcf_proto_ops cls_mall_ops __read_mostly = { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 2d9e0b4484ea..6f8786b06bde 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -641,12 +641,17 @@ nla_put_failure: return -1; } -static void route4_bind_class(void *fh, u32 classid, unsigned long cl) +static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct route4_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_route4_ops __read_mostly = { diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2f3c03b25d5d..c22624131949 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -738,12 +738,17 @@ nla_put_failure: return -1; } -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl) +static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct rsvp_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops RSVP_OPS __read_mostly = { diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e573e5a5c794..3d4a1280352f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -654,12 +654,17 @@ nla_put_failure: return -1; } -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl) +static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct tcindex_filter_result *r = fh; - if (r && r->res.classid == classid) - r->res.class = cl; + if (r && r->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &r->res, base); + else + __tcf_unbind_filter(q, &r->res); + } } static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index a0e6fac613de..e15ff335953d 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1255,12 +1255,17 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, return 0; } -static void u32_bind_class(void *fh, u32 classid, unsigned long cl) +static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct tc_u_knode *n = fh; - if (n && n->res.classid == classid) - n->res.class = cl; + if (n && n->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &n->res, base); + else + __tcf_unbind_filter(q, &n->res); + } } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 8f2ad706784d..dd3b8c11a2e0 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ 
-238,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { + err = -EINVAL; + if (em_hdr->flags & TCF_EM_SIMPLE) + goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; @@ -263,12 +266,12 @@ static int tcf_em_validate(struct tcf_proto *tp, } em->data = (unsigned long) v; } + em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; - em->datalen = data_len; em->net = net; err = 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1047825d9f48..50794125bf02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1891,8 +1891,9 @@ static int tclass_del_notify(struct net *net, struct tcf_bind_args { struct tcf_walker w; - u32 classid; + unsigned long base; unsigned long cl; + u32 classid; }; static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -1903,28 +1904,30 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); - tp->ops->bind_class(n, a->classid, a->cl); + tp->ops->bind_class(n, a->classid, a->cl, q, a->base); sch_tree_unlock(q); } return 0; } -static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, - unsigned long new_cl) +struct tc_bind_class_args { + struct qdisc_walker w; + unsigned long new_cl; + u32 portid; + u32 clid; +}; + +static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w) { + struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tcf_block *block; struct tcf_chain *chain; - unsigned long cl; - cl = cops->find(q, portid); - if (!cl) - return; - if (!cops->tcf_block) - return; block = cops->tcf_block(q, cl, NULL); if (!block) - return; + return 0; for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { @@ -1935,11 +1938,29 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; - arg.classid = clid; - arg.cl = new_cl; + arg.classid = a->clid; + arg.base = cl; + arg.cl = a->new_cl; tp->ops->walk(tp, &arg.w, true); } } + + return 0; +} + +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, + unsigned long new_cl) +{ + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct tc_bind_class_args args = {}; + + if (!cops->tcf_block) + return; + args.portid = portid; + args.clid = clid; + args.new_cl = new_cl; + args.w.fn = tc_bind_class_walker; + q->ops->cl_ops->walk(q, &args.w); } #else diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index e0f40400f679..1496e87cd07b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -173,8 +173,7 @@ struct cake_tin_data { u64 tin_rate_bps; u16 tin_rate_shft; - u16 tin_quantum_prio; - u16 tin_quantum_band; + u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; @@ -1683,8 +1682,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); @@ -1697,7 +1695,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, slen += segs->len; q->buffer_used += segs->truesize; b->packets++; - segs = nskb; } /* stats */ @@ -1769,7 +1766,7 @@ static s32 cake_enqueue(struct sk_buff 
*skb, struct Qdisc *sch, q->avg_window_begin)); u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; - do_div(b, window_interval); + b = div64_u64(b, window_interval); q->avg_peak_bandwidth = cake_ewma(q->avg_peak_bandwidth, b, b > q->avg_peak_bandwidth ? 2 : 8); @@ -1919,7 +1916,7 @@ begin: while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) - b->tin_deficit += b->tin_quantum_band; + b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; @@ -2241,8 +2238,7 @@ static int cake_config_besteffort(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_band = 65535; - b->tin_quantum_prio = 65535; + b->tin_quantum = 65535; return 0; } @@ -2253,8 +2249,7 @@ static int cake_config_precedence(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2267,18 +2262,14 @@ static int cake_config_precedence(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2347,8 +2338,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2364,18 +2354,14 @@ static int cake_config_diffserv8(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2414,17 +2400,11 @@ static int cake_config_diffserv4(struct Qdisc *sch) cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 2; - q->tins[3].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 1; - q->tins[3].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 1; + q->tins[3].tin_quantum = quantum >> 2; return 0; } @@ -2455,15 +2435,10 @@ static int cake_config_diffserv3(struct Qdisc *sch) cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + 
q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 2; return 0; } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index dba70377bbd9..a36974e9c601 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -377,7 +377,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO); + ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); if (!ntab) return -ENOMEM; diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c new file mode 100644 index 000000000000..a87e9159338c --- /dev/null +++ b/net/sched/sch_ets.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * net/sched/sch_ets.c Enhanced Transmission Selection scheduler + * + * Description + * ----------- + * + * The Enhanced Transmission Selection scheduler is a classful queuing + * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler. + * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to + * implement the transmission selection described in 802.1Qaz. + * + * Although ETS is technically classful, it's not possible to add and remove + * classes at will. Instead one specifies number of classes, how many are + * PRIO-like and how many DRR-like, and quanta for the latter. + * + * Algorithm + * --------- + * + * The strict classes, if any, are tried for traffic first: first band 0, if it + * has no traffic then band 1, etc. + * + * When there is no traffic in any of the strict queues, the bandwidth-sharing + * ones are tried next. Each band is assigned a deficit counter, initialized to + * "quantum" of that band. ETS maintains a list of active bandwidth-sharing + * bands whose qdiscs are non-empty. A packet is dequeued from the band at the + * head of the list if the packet size is smaller or equal to the deficit + * counter. If the counter is too small, it is increased by "quantum" and the + * scheduler moves on to the next band in the active list. + */ + +#include <linux/module.h> +#include <net/gen_stats.h> +#include <net/netlink.h> +#include <net/pkt_cls.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct ets_class { + struct list_head alist; /* In struct ets_sched.active. 
*/ + struct Qdisc *qdisc; + u32 quantum; + u32 deficit; + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; +}; + +struct ets_sched { + struct list_head active; + struct tcf_proto __rcu *filter_list; + struct tcf_block *block; + unsigned int nbands; + unsigned int nstrict; + u8 prio2band[TC_PRIO_MAX + 1]; + struct ets_class classes[TCQ_ETS_MAX_BANDS]; +}; + +static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_NBANDS] = { .type = NLA_U8 }, + [TCA_ETS_NSTRICT] = { .type = NLA_U8 }, + [TCA_ETS_QUANTA] = { .type = NLA_NESTED }, + [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 }, +}; + +static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, + unsigned int *quantum, + struct netlink_ext_ack *extack) +{ + *quantum = nla_get_u32(attr); + if (!*quantum) { + NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero"); + return -EINVAL; + } + return 0; +} + +static struct ets_class * +ets_class_from_arg(struct Qdisc *sch, unsigned long arg) +{ + struct ets_sched *q = qdisc_priv(sch); + + return &q->classes[arg - 1]; +} + +static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) +{ + struct ets_sched *q = qdisc_priv(sch); + int band = cl - q->classes; + + return TC_H_MAKE(sch->handle, band + 1); +} + +static void ets_offload_change(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct ets_sched *q = qdisc_priv(sch); + struct tc_ets_qopt_offload qopt; + unsigned int w_psum_prev = 0; + unsigned int q_psum = 0; + unsigned int q_sum = 0; + unsigned int quantum; + unsigned int w_psum; + unsigned int weight; + unsigned int i; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.bands = q->nbands; + qopt.replace_params.qstats = &sch->qstats; + memcpy(&qopt.replace_params.priomap, + q->prio2band, sizeof(q->prio2band)); + + for (i = 0; i < q->nbands; i++) + q_sum += q->classes[i].quantum; + + for (i = 0; i < q->nbands; i++) { + quantum = q->classes[i].quantum; + q_psum += quantum; + w_psum = quantum ? 
q_psum * 100 / q_sum : 0; + weight = w_psum - w_psum_prev; + w_psum_prev = w_psum; + + qopt.replace_params.quanta[i] = quantum; + qopt.replace_params.weights[i] = weight; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc *old, unsigned long arg, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_GRAFT; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.graft_params.band = arg - 1; + qopt.graft_params.child_handle = new->handle; + + qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS, + &qopt, extack); +} + +static int ets_offload_dump(struct Qdisc *sch) +{ + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt); +} + +static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) +{ + unsigned int band = cl - q->classes; + + return band < q->nstrict; +} + +static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, *arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int quantum; + int err; + + /* Classes can be added and removed only through Qdisc_ops.change + * interface. + */ + if (!cl) { + NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported"); + return -EOPNOTSUPP; + } + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_QUANTA_BAND]) + /* Nothing to configure. 
*/ + return 0; + + if (ets_class_is_strict(q, cl)) { + NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum"); + return -EINVAL; + } + + err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum, + extack); + if (err) + return err; + + sch_tree_lock(sch); + cl->quantum = quantum; + sch_tree_unlock(sch); + + ets_offload_change(sch); + return 0; +} + +static int ets_class_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, cl), NULL); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } + + *old = qdisc_replace(sch, new, &cl->qdisc); + ets_offload_graft(sch, new, *old, arg, extack); + return 0; +} + +static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + return cl->qdisc; +} + +static unsigned long ets_class_find(struct Qdisc *sch, u32 classid) +{ + unsigned long band = TC_H_MIN(classid); + struct ets_sched *q = qdisc_priv(sch); + + if (band - 1 >= q->nbands) + return 0; + return band; +} + +static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + + /* We get notified about zero-length child Qdiscs as well if they are + * offloaded. Those aren't on the active list though, so don't attempt + * to remove them. + */ + if (!ets_class_is_strict(q, cl) && sch->q.qlen) + list_del(&cl->alist); +} + +static int ets_class_dump(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = ets_class_id(sch, cl); + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + if (!ets_class_is_strict(q, cl)) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum)) + goto nla_put_failure; + } + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct Qdisc *cl_q = cl->qdisc; + + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + + return 0; +} + +static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct ets_sched *q = qdisc_priv(sch); + int i; + + if (arg->stop) + return; + + for (i = 0; i < q->nbands; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_block * +ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + + if (cl) { + NL_SET_ERR_MSG(extack, "ETS classid must be zero"); + return NULL; + } + + return q->block; +} + +static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return ets_class_find(sch, classid); +} + +static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ +} + +static struct 
ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct ets_sched *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + struct tcf_proto *fl; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + if (TC_H_MAJ(skb->priority) != sch->handle) { + fl = rcu_dereference_bh(q->filter_list); + err = tcf_classify(skb, fl, &res, false); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return NULL; + } +#endif + if (!fl || err < 0) { + if (TC_H_MAJ(band)) + band = 0; + return &q->classes[q->prio2band[band & TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band >= q->nbands) + return &q->classes[q->prio2band[0]]; + return &q->classes[band]; +} + +static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + unsigned int len = qdisc_pkt_len(skb); + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + int err = 0; + bool first; + + cl = ets_classify(skb, sch, &err); + if (!cl) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + first = !cl->qdisc->q.qlen; + err = qdisc_enqueue(skb, cl->qdisc, to_free); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + qdisc_qstats_drop(sch); + } + return err; + } + + if (first && !ets_class_is_strict(q, cl)) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + sch->qstats.backlog += len; + sch->q.qlen++; + return err; +} + +static struct sk_buff * +ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + return skb; +} + +static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + struct sk_buff *skb; + unsigned int band; + unsigned int len; + + while (1) { + for (band = 0; band < q->nstrict; band++) { + cl = &q->classes[band]; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (skb) + return ets_qdisc_dequeue_skb(sch, skb); + } + + if (list_empty(&q->active)) + goto out; + + cl = list_first_entry(&q->active, struct ets_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (!skb) { + qdisc_warn_nonwc(__func__, cl->qdisc); + goto out; + } + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (unlikely(!skb)) + goto out; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return ets_qdisc_dequeue_skb(sch, skb); + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr, + unsigned int nbands, u8 *priomap, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int prio = 0; + u8 band; + int rem; + int err; + + err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX, + ets_priomap_policy, NL_VALIDATE_STRICT, + extack); + if (err) + return err; + + nla_for_each_nested(attr, priomap_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_PRIOMAP_BAND: + if (prio > TC_PRIO_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap"); + return -EINVAL; + } + band = nla_get_u8(attr); + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap"); + 
return -EINVAL; + } + priomap[prio++] = band; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr, + unsigned int nbands, unsigned int nstrict, + unsigned int *quanta, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int band = nstrict; + int rem; + int err; + + err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX, + ets_quanta_policy, NL_VALIDATE_STRICT, + extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, quanta_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_QUANTA_BAND: + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands"); + return -EINVAL; + } + err = ets_quantum_parse(sch, attr, &quanta[band++], + extack); + if (err) + return err; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0}; + struct Qdisc *queues[TCQ_ETS_MAX_BANDS]; + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int oldbands = q->nbands; + u8 priomap[TC_PRIO_MAX + 1]; + unsigned int nstrict = 0; + unsigned int nbands; + unsigned int i; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_NBANDS]) { + NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument"); + return -EINVAL; + } + nbands = nla_get_u8(tb[TCA_ETS_NBANDS]); + if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands"); + return -EINVAL; + } + /* Unless overridden, traffic goes to the last band. */ + memset(priomap, nbands - 1, sizeof(priomap)); + + if (tb[TCA_ETS_NSTRICT]) { + nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]); + if (nstrict > nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands"); + return -EINVAL; + } + } + + if (tb[TCA_ETS_PRIOMAP]) { + err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP], + nbands, priomap, extack); + if (err) + return err; + } + + if (tb[TCA_ETS_QUANTA]) { + err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA], + nbands, nstrict, quanta, extack); + if (err) + return err; + } + /* If there are more bands than strict + quanta provided, the remaining + * ones are ETS with quantum of MTU. Initialize the missing values here. 
+ */ + for (i = nstrict; i < nbands; i++) { + if (!quanta[i]) + quanta[i] = psched_mtu(qdisc_dev(sch)); + } + + /* Before commit, make sure we can allocate all new qdiscs */ + for (i = oldbands; i < nbands; i++) { + queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, &q->classes[i]), + extack); + if (!queues[i]) { + while (i > oldbands) + qdisc_put(queues[--i]); + return -ENOMEM; + } + } + + sch_tree_lock(sch); + + q->nbands = nbands; + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); + + for (i = q->nbands; i < oldbands; i++) + qdisc_tree_flush_backlog(q->classes[i].qdisc); + + for (i = 0; i < q->nbands; i++) + q->classes[i].quantum = quanta[i]; + + for (i = oldbands; i < q->nbands; i++) { + q->classes[i].qdisc = queues[i]; + if (q->classes[i].qdisc != &noop_qdisc) + qdisc_hash_add(q->classes[i].qdisc, true); + } + + sch_tree_unlock(sch); + + ets_offload_change(sch); + for (i = q->nbands; i < oldbands; i++) { + qdisc_put(q->classes[i].qdisc); + memset(&q->classes[i], 0, sizeof(q->classes[i])); + } + return 0; +} + +static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + int err; + + if (!opt) + return -EINVAL; + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + INIT_LIST_HEAD(&q->active); + return ets_qdisc_change(sch, opt, extack); +} + +static void ets_qdisc_reset(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + for (band = q->nstrict; band < q->nbands; band++) { + if (q->classes[band].qdisc->q.qlen) + list_del(&q->classes[band].alist); + } + for (band = 0; band < q->nbands; band++) + qdisc_reset(q->classes[band].qdisc); + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + +static void ets_qdisc_destroy(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + ets_offload_destroy(sch); + tcf_block_put(q->block); + for (band = 0; band < q->nbands; band++) + qdisc_put(q->classes[band].qdisc); +} + +static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opts; + struct nlattr *nest; + int band; + int prio; + int err; + + err = ets_offload_dump(sch); + if (err) + return err; + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_err; + + if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + goto nla_err; + + if (q->nstrict && + nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + goto nla_err; + + if (q->nbands > q->nstrict) { + nest = nla_nest_start(skb, TCA_ETS_QUANTA); + if (!nest) + goto nla_err; + + for (band = q->nstrict; band < q->nbands; band++) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, + q->classes[band].quantum)) + goto nla_err; + } + + nla_nest_end(skb, nest); + } + + nest = nla_nest_start(skb, TCA_ETS_PRIOMAP); + if (!nest) + goto nla_err; + + for (prio = 0; prio <= TC_PRIO_MAX; prio++) { + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + goto nla_err; + } + + nla_nest_end(skb, nest); + + return nla_nest_end(skb, opts); + +nla_err: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static const struct Qdisc_class_ops ets_class_ops = { + .change = ets_class_change, + .graft = ets_class_graft, + .leaf = ets_class_leaf, + .find = ets_class_find, + .qlen_notify = ets_class_qlen_notify, + .dump = ets_class_dump, + .dump_stats = ets_class_dump_stats, + .walk = ets_qdisc_walk, + .tcf_block = ets_qdisc_tcf_block, + .bind_tcf = ets_qdisc_bind_tcf, + 
.unbind_tcf = ets_qdisc_unbind_tcf, +}; + +static struct Qdisc_ops ets_qdisc_ops __read_mostly = { + .cl_ops = &ets_class_ops, + .id = "ets", + .priv_size = sizeof(struct ets_sched), + .enqueue = ets_qdisc_enqueue, + .dequeue = ets_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .change = ets_qdisc_change, + .init = ets_qdisc_init, + .reset = ets_qdisc_reset, + .destroy = ets_qdisc_destroy, + .dump = ets_qdisc_dump, + .owner = THIS_MODULE, +}; + +static int __init ets_init(void) +{ + return register_qdisc(&ets_qdisc_ops); +} + +static void __exit ets_exit(void) +{ + unregister_qdisc(&ets_qdisc_ops); +} + +module_init(ets_init); +module_exit(ets_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ff4c5e9d0d77..a5a295477ecc 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -786,10 +786,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); - if (quantum > 0) + if (quantum > 0 && quantum <= (1 << 20)) { q->quantum = quantum; - else + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); err = -EINVAL; + } } if (tb[TCA_FQ_INITIAL_QUANTUM]) diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c new file mode 100644 index 000000000000..bbd0dea6b6b9 --- /dev/null +++ b/net/sched/sch_fq_pie.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Flow Queue PIE discipline + * + * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in> + * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com> + * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com> + * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com> + * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com> + * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com> + */ + +#include <linux/jhash.h> +#include <linux/sizes.h> +#include <linux/vmalloc.h> +#include <net/pkt_cls.h> +#include <net/pie.h> + +/* Flow Queue PIE + * + * Principles: + * - Packets are classified on flows. + * - This is a Stochastic model (as we use a hash, several flows might + * be hashed to the same slot) + * - Each flow has a PIE managed queue. + * - Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * - For a given flow, packets are not reordered. + * - Drops during enqueue only. + * - ECN capability is off by default. + * - ECN threshold (if ECN is enabled) is at 10% by default. + * - Uses timestamps to calculate queue delay by default. 
+ */ + +/** + * struct fq_pie_flow - contains data for each flow + * @vars: pie vars associated with the flow + * @deficit: number of remaining byte credits + * @backlog: size of data in the flow + * @qlen: number of packets in the flow + * @flowchain: flowchain for the flow + * @head: first packet in the flow + * @tail: last packet in the flow + */ +struct fq_pie_flow { + struct pie_vars vars; + s32 deficit; + u32 backlog; + u32 qlen; + struct list_head flowchain; + struct sk_buff *head; + struct sk_buff *tail; +}; + +struct fq_pie_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct fq_pie_flow *flows; + struct Qdisc *sch; + struct list_head old_flows; + struct list_head new_flows; + struct pie_params p_params; + u32 ecn_prob; + u32 flows_cnt; + u32 quantum; + u32 memory_limit; + u32 new_flow_count; + u32 memory_usage; + u32 overmemory; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q, + struct sk_buff *skb) +{ + return reciprocal_scale(skb_get_hash(skb), q->flows_cnt); +} + +static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + return fq_pie_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_pie_flow *flow, + struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct fq_pie_flow *sel_flow; + int uninitialized_var(ret); + u8 memory_limited = false; + u8 enqueue = false; + u32 pkt_len; + u32 idx; + + /* Classifies packet into corresponding flow */ + idx = fq_pie_classify(skb, sch, &ret); + sel_flow = &q->flows[idx]; + + /* Checks whether adding a new packet would exceed memory limit */ + get_pie_cb(skb)->mem_usage = skb->truesize; + memory_limited = q->memory_usage > q->memory_limit + skb->truesize; + + /* Checks if the qdisc is full */ + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } else if (unlikely(memory_limited)) { + q->overmemory++; + } + + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, + sel_flow->backlog, skb->len)) { + enqueue = true; + } else if (q->p_params.ecn && + sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than the parameter ecn_prob, else drop it. 
+ */ + q->stats.ecn_mark++; + enqueue = true; + } + if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->p_params.dq_rate_estimator) + pie_set_enqueue_time(skb); + + pkt_len = qdisc_pkt_len(skb); + q->stats.packets_in++; + q->memory_usage += skb->truesize; + sch->qstats.backlog += pkt_len; + sch->q.qlen++; + flow_queue_add(sel_flow, skb); + if (list_empty(&sel_flow->flowchain)) { + list_add_tail(&sel_flow->flowchain, &q->new_flows); + q->new_flow_count++; + sel_flow->deficit = q->quantum; + sel_flow->qlen = 0; + sel_flow->backlog = 0; + } + sel_flow->qlen++; + sel_flow->backlog += pkt_len; + return NET_XMIT_SUCCESS; + } +out: + q->stats.dropped++; + sel_flow->vars.accu_prob = 0; + sel_flow->vars.accu_prob_overflows = 0; + __qdisc_drop(skb, to_free); + qdisc_qstats_drop(sch); + return NET_XMIT_CN; +} + +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = { + [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32}, + [TCA_FQ_PIE_TARGET] = {.type = NLA_U32}, + [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_FQ_PIE_BETA] = {.type = NLA_U32}, + [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32}, + [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN] = {.type = NLA_U32}, + [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, +}; + +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct fq_pie_flow *flow; + struct list_head *head; + u32 pkt_len; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + + flow = list_first_entry(head, struct fq_pie_flow, flowchain); + /* Flow has exhausted all its credits */ + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + if (flow->head) { + skb = dequeue_head(flow); + pkt_len = qdisc_pkt_len(skb); + sch->qstats.backlog -= pkt_len; + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + } + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if (head == &q->new_flows && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + + flow->qlen--; + flow->deficit -= pkt_len; + flow->backlog -= pkt_len; + q->memory_usage -= get_pie_cb(skb)->mem_usage; + pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog); + return skb; +} + +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; + unsigned int len_dropped = 0; + unsigned int num_dropped = 0; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); + if (err < 0) + return err; + + sch_tree_lock(sch); + if (tb[TCA_FQ_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); + + q->p_params.limit = limit; + sch->limit = limit; + } + if (tb[TCA_FQ_PIE_FLOWS]) { + if (q->flows) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows cannot be changed"); + goto flow_error; + } + 
q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); + if (!q->flows_cnt || q->flows_cnt > 65536) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows must be < 65536"); + goto flow_error; + } + } + + /* convert from microseconds to pschedtime */ + if (tb[TCA_FQ_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); + + /* convert to pschedtime */ + q->p_params.target = + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_FQ_PIE_TUPDATE]) + q->p_params.tupdate = + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + + if (tb[TCA_FQ_PIE_ALPHA]) + q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + + if (tb[TCA_FQ_PIE_BETA]) + q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + + if (tb[TCA_FQ_PIE_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + + if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) + q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + + if (tb[TCA_FQ_PIE_ECN_PROB]) + q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + + if (tb[TCA_FQ_PIE_ECN]) + q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + + if (tb[TCA_FQ_PIE_BYTEMODE]) + q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + + if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) + q->p_params.dq_rate_estimator = + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + + /* Drop excess packets if new limit is lower */ + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + + kfree_skb(skb); + len_dropped += qdisc_pkt_len(skb); + num_dropped += 1; + } + qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + + sch_tree_unlock(sch); + return 0; + +flow_error: + sch_tree_unlock(sch); + return -EINVAL; +} + +static void fq_pie_timer(struct timer_list *t) +{ + struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + u16 idx; + + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + for (idx = 0; idx < q->flows_cnt; idx++) + pie_calculate_probability(&q->p_params, &q->flows[idx].vars, + q->flows[idx].backlog); + + /* reset the timer to fire after 'tupdate' jiffies. 
*/ + if (q->p_params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate); + + spin_unlock(root_lock); +} + +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + int err; + u16 idx; + + pie_params_init(&q->p_params); + sch->limit = 10 * 1024; + q->p_params.limit = sch->limit; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->sch = sch; + q->ecn_prob = 10; + q->flows_cnt = 1024; + q->memory_limit = SZ_32M; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + + if (opt) { + err = fq_pie_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + goto init_failure; + + q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow), + GFP_KERNEL); + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + timer_setup(&q->adapt_timer, fq_pie_timer, 0); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + return 0; + +init_failure: + q->flows_cnt = 0; + + return err; +} + +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + return -EMSGSIZE; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + nla_put_u32(skb, TCA_FQ_PIE_TARGET, + ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, + jiffies_to_usecs(q->p_params.tupdate)) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + q->p_params.dq_rate_estimator)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tc_fq_pie_xstats st = { + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .overmemory = q->overmemory, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + .new_flow_count = q->new_flow_count, + .memory_usage = q->memory_usage, + }; + struct list_head *pos; + + sch_tree_lock(sch); + list_for_each(pos, &q->new_flows) + st.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.old_flows_len++; + sch_tree_unlock(sch); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void fq_pie_reset(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + u16 idx; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + /* Removes all packets from flow */ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + sch->q.qlen = 0; + 
sch->qstats.backlog = 0; +} + +static void fq_pie_destroy(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + + tcf_block_put(q->block); + del_timer_sync(&q->adapt_timer); + kvfree(q->flows); +} + +static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = { + .id = "fq_pie", + .priv_size = sizeof(struct fq_pie_sched_data), + .enqueue = fq_pie_qdisc_enqueue, + .dequeue = fq_pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_pie_init, + .destroy = fq_pie_destroy, + .reset = fq_pie_reset, + .change = fq_pie_change, + .dump = fq_pie_dump, + .dump_stats = fq_pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_pie_module_init(void) +{ + return register_qdisc(&fq_pie_qdisc_ops); +} + +static void __exit fq_pie_module_exit(void) +{ + unregister_qdisc(&fq_pie_qdisc_ops); +} + +module_init(fq_pie_module_init); +module_exit(fq_pie_module_exit); + +MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"); +MODULE_AUTHOR("Mohit P. Tahiliani"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5ab696efca95..6c9595f1048a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -441,7 +441,7 @@ static void dev_watchdog(struct timer_list *t) trace_net_dev_xmit_timeout(dev, i); WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", dev->name, netdev_drivername(dev), i); - dev->netdev_ops->ndo_tx_timeout(dev); + dev->netdev_ops->ndo_tx_timeout(dev, i); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b0b0dc46af61..915bcdb59a9f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -19,159 +19,76 @@ #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> - -#define QUEUE_THRESHOLD 16384 -#define DQCOUNT_INVALID -1 -#define DTIME_INVALID 0xffffffffffffffff -#define MAX_PROB 0xffffffffffffffff -#define PIE_SCALE 8 - -/* parameters used */ -struct pie_params { - psched_time_t target; /* user specified target delay in pschedtime */ - u32 tupdate; /* timer frequency (in jiffies) */ - u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between 0 and 32 */ - u32 beta; /* and are used for shift relative to 1 */ - bool ecn; /* true if ecn is enabled */ - bool bytemode; /* to scale drop early prob based on pkt size */ - u8 dq_rate_estimator; /* to calculate delay using Little's law */ -}; - -/* variables used */ -struct pie_vars { - u64 prob; /* probability but scaled by u64 limit. 
*/ - psched_time_t burst_time; - psched_time_t qdelay; - psched_time_t qdelay_old; - u64 dq_count; /* measured in bytes */ - psched_time_t dq_tstamp; /* drain rate */ - u64 accu_prob; /* accumulated drop probability */ - u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ - u32 qlen_old; /* in bytes */ - u8 accu_prob_overflows; /* overflows of accu_prob */ -}; - -/* statistics gathering */ -struct pie_stats { - u32 packets_in; /* total number of packets enqueued */ - u32 dropped; /* packets dropped due to pie_action */ - u32 overlimit; /* dropped due to lack of space in queue */ - u32 maxq; /* maximum queue size */ - u32 ecn_mark; /* packets marked with ECN */ -}; +#include <net/pie.h> /* private data for the Qdisc */ struct pie_sched_data { - struct pie_params params; struct pie_vars vars; + struct pie_params params; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; }; -static void pie_params_init(struct pie_params *params) +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size) { - params->alpha = 2; - params->beta = 20; - params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ - params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ - params->ecn = false; - params->bytemode = false; - params->dq_rate_estimator = false; -} - -/* private skb vars */ -struct pie_skb_cb { - psched_time_t enqueue_time; -}; - -static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); - return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; -} - -static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) -{ - return get_pie_cb(skb)->enqueue_time; -} - -static void pie_set_enqueue_time(struct sk_buff *skb) -{ - get_pie_cb(skb)->enqueue_time = psched_get_time(); -} - -static void pie_vars_init(struct pie_vars *vars) -{ - vars->dq_count = DQCOUNT_INVALID; - vars->dq_tstamp = DTIME_INVALID; - vars->accu_prob = 0; - vars->avg_dq_rate = 0; - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); - vars->accu_prob_overflows = 0; -} - -static bool drop_early(struct Qdisc *sch, u32 packet_size) -{ - struct pie_sched_data *q = qdisc_priv(sch); u64 rnd; - u64 local_prob = q->vars.prob; + u64 local_prob = vars->prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ - if (q->vars.burst_time > 0) + if (vars->burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.prob < MAX_PROB / 5)) + if ((vars->qdelay < params->target / 2) && + (vars->prob < MAX_PROB / 5)) return false; - /* If we have fewer than 2 mtu-sized packets, disable drop_early, + /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ - if (sch->qstats.backlog < 2 * mtu) + if (qlen < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. 
Smaller packets will have lower drop prob in this case */ - if (q->params.bytemode && packet_size <= mtu) + if (params->bytemode && packet_size <= mtu) local_prob = (u64)packet_size * div_u64(local_prob, mtu); else - local_prob = q->vars.prob; + local_prob = vars->prob; if (local_prob == 0) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; } - if (local_prob > MAX_PROB - q->vars.accu_prob) - q->vars.accu_prob_overflows++; + if (local_prob > MAX_PROB - vars->accu_prob) + vars->accu_prob_overflows++; - q->vars.accu_prob += local_prob; + vars->accu_prob += local_prob; - if (q->vars.accu_prob_overflows == 0 && - q->vars.accu_prob < (MAX_PROB / 100) * 85) + if (vars->accu_prob_overflows == 0 && + vars->accu_prob < (MAX_PROB / 100) * 85) return false; - if (q->vars.accu_prob_overflows == 8 && - q->vars.accu_prob >= MAX_PROB / 2) + if (vars->accu_prob_overflows == 8 && + vars->accu_prob >= MAX_PROB / 2) return true; prandom_bytes(&rnd, 8); if (rnd < local_prob) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; return true; } return false; } +EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) @@ -184,7 +101,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } - if (!drop_early(sch, skb->len)) { + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, + skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { @@ -216,14 +134,14 @@ out: } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { - [TCA_PIE_TARGET] = {.type = NLA_U32}, - [TCA_PIE_LIMIT] = {.type = NLA_U32}, - [TCA_PIE_TUPDATE] = {.type = NLA_U32}, - [TCA_PIE_ALPHA] = {.type = NLA_U32}, - [TCA_PIE_BETA] = {.type = NLA_U32}, - [TCA_PIE_ECN] = {.type = NLA_U32}, - [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, - [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, @@ -296,26 +214,25 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, return 0; } -static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - int qlen = sch->qstats.backlog; /* current queue size in bytes */ psched_time_t now = psched_get_time(); u32 dtime = 0; /* If dq_rate_estimator is disabled, calculate qdelay using the * packet timestamp. 
*/ - if (!q->params.dq_rate_estimator) { - q->vars.qdelay = now - pie_get_enqueue_time(skb); + if (!params->dq_rate_estimator) { + vars->qdelay = now - pie_get_enqueue_time(skb); - if (q->vars.dq_tstamp != DTIME_INVALID) - dtime = now - q->vars.dq_tstamp; + if (vars->dq_tstamp != DTIME_INVALID) + dtime = now - vars->dq_tstamp; - q->vars.dq_tstamp = now; + vars->dq_tstamp = now; if (qlen == 0) - q->vars.qdelay = 0; + vars->qdelay = 0; if (dtime == 0) return; @@ -327,39 +244,39 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ - if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { - q->vars.dq_tstamp = psched_get_time(); - q->vars.dq_count = 0; + if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { + vars->dq_tstamp = psched_get_time(); + vars->dq_count = 0; } - /* Calculate the average drain rate from this value. If queue length - * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset * the dq_count to -1 as we don't have enough packets to calculate the - * drain rate anymore The following if block is entered only when we + * drain rate anymore. The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ - if (q->vars.dq_count != DQCOUNT_INVALID) { - q->vars.dq_count += skb->len; + if (vars->dq_count != DQCOUNT_INVALID) { + vars->dq_count += skb->len; - if (q->vars.dq_count >= QUEUE_THRESHOLD) { - u32 count = q->vars.dq_count << PIE_SCALE; + if (vars->dq_count >= QUEUE_THRESHOLD) { + u32 count = vars->dq_count << PIE_SCALE; - dtime = now - q->vars.dq_tstamp; + dtime = now - vars->dq_tstamp; if (dtime == 0) return; count = count / dtime; - if (q->vars.avg_dq_rate == 0) - q->vars.avg_dq_rate = count; + if (vars->avg_dq_rate == 0) + vars->avg_dq_rate = count; else - q->vars.avg_dq_rate = - (q->vars.avg_dq_rate - - (q->vars.avg_dq_rate >> 3)) + (count >> 3); + vars->avg_dq_rate = + (vars->avg_dq_rate - + (vars->avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset @@ -367,10 +284,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * packet is dequeued */ if (qlen < QUEUE_THRESHOLD) { - q->vars.dq_count = DQCOUNT_INVALID; + vars->dq_count = DQCOUNT_INVALID; } else { - q->vars.dq_count = 0; - q->vars.dq_tstamp = psched_get_time(); + vars->dq_count = 0; + vars->dq_tstamp = psched_get_time(); } goto burst_allowance_reduction; @@ -380,18 +297,18 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) return; burst_allowance_reduction: - if (q->vars.burst_time > 0) { - if (q->vars.burst_time > dtime) - q->vars.burst_time -= dtime; + if (vars->burst_time > 0) { + if (vars->burst_time > dtime) + vars->burst_time -= dtime; else - q->vars.burst_time = 0; + vars->burst_time = 0; } } +EXPORT_SYMBOL_GPL(pie_process_dequeue); -static void calculate_probability(struct Qdisc *sch) +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - u32 qlen = 
sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ @@ -400,21 +317,21 @@ static void calculate_probability(struct Qdisc *sch) u32 power; bool update_prob = true; - if (q->params.dq_rate_estimator) { - qdelay_old = q->vars.qdelay; - q->vars.qdelay_old = q->vars.qdelay; + if (params->dq_rate_estimator) { + qdelay_old = vars->qdelay; + vars->qdelay_old = vars->qdelay; - if (q->vars.avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + if (vars->avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate; else qdelay = 0; } else { - qdelay = q->vars.qdelay; - qdelay_old = q->vars.qdelay_old; + qdelay = vars->qdelay; + qdelay_old = vars->qdelay_old; } - /* If qdelay is zero and qlen is not, it means qlen is very small, less - * than dequeue_rate, so we do not update probabilty in this round + /* If qdelay is zero and qlen is not, it means qlen is very small, + * so we do not update probabilty in this round. */ if (qdelay == 0 && qlen != 0) update_prob = false; @@ -426,18 +343,18 @@ static void calculate_probability(struct Qdisc *sch) * probability. alpha/beta are updated locally below by scaling down * by 16 to come to 0-2 range. */ - alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; /* We scale alpha and beta differently depending on how heavy the * congestion is. Please see RFC 8033 for details. */ - if (q->vars.prob < MAX_PROB / 10) { + if (vars->prob < MAX_PROB / 10) { alpha >>= 1; beta >>= 1; power = 100; - while (q->vars.prob < div_u64(MAX_PROB, power) && + while (vars->prob < div_u64(MAX_PROB, power) && power <= 1000000) { alpha >>= 2; beta >>= 2; @@ -446,14 +363,14 @@ static void calculate_probability(struct Qdisc *sch) } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * (u64)(qdelay - q->params.target); + delta += alpha * (u64)(qdelay - params->target); delta += beta * (u64)(qdelay - qdelay_old); - oldprob = q->vars.prob; + oldprob = vars->prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s64)(MAX_PROB / (100 / 2)) && - q->vars.prob >= MAX_PROB / 10) + vars->prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: @@ -464,12 +381,12 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - q->vars.prob += delta; + vars->prob += delta; if (delta > 0) { /* prevent overflow */ - if (q->vars.prob < oldprob) { - q->vars.prob = MAX_PROB; + if (vars->prob < oldprob) { + vars->prob = MAX_PROB; /* Prevent normalization error. 
If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next @@ -479,8 +396,8 @@ static void calculate_probability(struct Qdisc *sch) } } else { /* prevent underflow */ - if (q->vars.prob > oldprob) - q->vars.prob = 0; + if (vars->prob > oldprob) + vars->prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if @@ -489,10 +406,10 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - q->vars.prob -= q->vars.prob / 64u; + vars->prob -= vars->prob / 64; - q->vars.qdelay = qdelay; - q->vars.qlen_old = qlen; + vars->qdelay = qdelay; + vars->qlen_old = qlen; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods @@ -500,16 +417,17 @@ static void calculate_probability(struct Qdisc *sch) * 3. If average dq_rate_estimator is enabled, we have atleast one * estimate for the avg_dq_rate ie., is a non-zero value */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.qdelay_old < q->params.target / 2) && - q->vars.prob == 0 && - (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) { - pie_vars_init(&q->vars); + if ((vars->qdelay < params->target / 2) && + (vars->qdelay_old < params->target / 2) && + vars->prob == 0 && + (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) { + pie_vars_init(vars); } - if (!q->params.dq_rate_estimator) - q->vars.qdelay_old = qdelay; + if (!params->dq_rate_estimator) + vars->qdelay_old = qdelay; } +EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { @@ -518,7 +436,7 @@ static void pie_timer(struct timer_list *t) spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - calculate_probability(sch); + pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
*/ if (q->params.tupdate) @@ -607,12 +525,13 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { + struct pie_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; - pie_process_dequeue(sch, skb); + pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog); return skb; } @@ -633,7 +552,7 @@ static void pie_destroy(struct Qdisc *sch) } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { - .id = "pie", + .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 18b884cfdfe8..647941702f9f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -292,8 +292,14 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; - if (new == NULL) - new = &noop_qdisc; + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, arg), extack); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } *old = qdisc_replace(sch, new, &q->queues[band]); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 5f72f3f916a5..78e79029dc63 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -15,6 +15,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/sch_generic.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> @@ -137,6 +138,52 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, return len; } +static void tbf_offload_change(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.rate = q->rate; + qopt.replace_params.max_size = q->max_size; + qopt.replace_params.qstats = &sch->qstats; + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static void tbf_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static int tbf_offload_dump(struct Qdisc *sch) +{ + struct tc_tbf_qopt_offload qopt; + + qopt.command = TC_TBF_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); +} + /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -155,8 +202,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); nb = 0; - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; @@ -167,7 +213,6 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, } else { nb++; } - segs = nskb; } sch->q.qlen += nb; if (nb > 1) @@ -409,6 +454,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_unlock(sch); err = 0; + + 
tbf_offload_change(sch); done: return err; } @@ -434,6 +481,7 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); + tbf_offload_destroy(sch); qdisc_put(q->qdisc); } @@ -442,8 +490,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *nest; struct tc_tbf_qopt opt; + int err; + + err = tbf_offload_dump(sch); + if (err) + return err; - sch->qstats.backlog = q->qdisc->qstats.backlog; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; |
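
Note on the FQ-PIE flow buckets: fq_pie_init() above sets q->flows_cnt to 1024 and kvcalloc()s one struct fq_pie_flow per bucket; the classification path that hashes packets onto those buckets is not part of the hunks shown here. The standalone userspace sketch below only illustrates the usual fq_*-style bucket selection (a multiply-shift of the 32-bit flow hash, the same idea as the kernel's reciprocal_scale()), under that assumption; bucket_for_hash() is a hypothetical name, not a function from this patch.

#include <stdint.h>
#include <stdio.h>

/* Map a 32-bit flow hash uniformly onto [0, flows_cnt) without requiring
 * flows_cnt to be a power of two. Illustration only. */
static uint32_t bucket_for_hash(uint32_t hash, uint32_t flows_cnt)
{
        return (uint32_t)(((uint64_t)hash * flows_cnt) >> 32);
}

int main(void)
{
        const uint32_t flows_cnt = 1024;        /* fq_pie default above */
        const uint32_t hashes[] = { 0x12345678u, 0xdeadbeefu, 0xffffffffu, 7u };

        for (unsigned int i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
                printf("hash %#010x -> flow bucket %u of %u\n",
                       hashes[i], bucket_for_hash(hashes[i], flows_cnt),
                       flows_cnt);
        return 0;
}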
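
The bulk of the sch_pie.c diff turns calculate_probability() into the exported pie_calculate_probability() helper so that fq_pie can drive one PIE instance per flow. The standalone C program below is a deliberately simplified sketch of that proportional-integral update: it uses microseconds instead of psched ticks and omits the low-probability alpha/beta rescaling, the 2% step cap and the non-linear drop regions, just to show how the alpha term (deviation from the target delay) and the beta term (delay trend) move the scaled drop probability. The names and scaling here are illustrative assumptions, not the kernel code; only the default gains and target mirror what pie_params_init() used.

#include <stdint.h>
#include <stdio.h>

#define MAX_PROB UINT64_MAX     /* drop probability scaled to 0..2^64-1 */

struct pie_sim {
        uint64_t prob;          /* current scaled drop probability */
        uint32_t alpha;         /* 0..32, in multiples of 1/16 */
        uint32_t beta;          /* 0..32, in multiples of 1/16 */
        uint32_t target_us;     /* target queue delay */
        uint32_t qdelay_old_us; /* delay seen at the previous update */
};

static void pie_sim_update(struct pie_sim *s, uint32_t qdelay_us)
{
        /* scale the gains so a one-second delay error maps to MAX_PROB */
        uint64_t w_alpha = ((uint64_t)s->alpha * (MAX_PROB / 1000000)) >> 4;
        uint64_t w_beta  = ((uint64_t)s->beta  * (MAX_PROB / 1000000)) >> 4;
        uint64_t oldprob = s->prob;
        int64_t delta = 0;

        /* proportional term: how far the delay is from the target */
        delta += (int64_t)w_alpha *
                 ((int64_t)qdelay_us - (int64_t)s->target_us);
        /* "integral" term: how the delay is trending */
        delta += (int64_t)w_beta *
                 ((int64_t)qdelay_us - (int64_t)s->qdelay_old_us);

        s->prob += (uint64_t)delta;     /* wraps; clamp below */
        if (delta > 0 && s->prob < oldprob)
                s->prob = MAX_PROB;     /* overflow: saturate at 100% */
        else if (delta < 0 && s->prob > oldprob)
                s->prob = 0;            /* underflow: saturate at 0% */

        s->qdelay_old_us = qdelay_us;
}

int main(void)
{
        /* defaults mirror pie_params_init(): alpha 2, beta 20, target 15 ms */
        struct pie_sim s = { .prob = 0, .alpha = 2, .beta = 20,
                             .target_us = 15000, .qdelay_old_us = 0 };
        const uint32_t delays_us[] = { 5000, 20000, 40000, 30000, 10000, 5000 };

        for (unsigned int i = 0;
             i < sizeof(delays_us) / sizeof(delays_us[0]); i++) {
                pie_sim_update(&s, delays_us[i]);
                printf("qdelay %5u us -> drop probability %.4f%%\n",
                       delays_us[i],
                       100.0 * (double)s.prob / (double)MAX_PROB);
        }
        return 0;
}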
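
On the TBF side, tbf_offload_change(), tbf_offload_destroy() and tbf_offload_dump() above hand a struct tc_tbf_qopt_offload to the driver through ndo_setup_tc() with the new TC_SETUP_QDISC_TBF type. The fragment below is a hedged sketch of how a driver callback might consume that structure, assuming the fields visible in the hunks above (command, handle, replace_params.rate/max_size, stats) and a rate carried as a struct psched_ratecfg; the foo_ prefix and the netdev_info() placeholders are hypothetical, not code from this series, and real drivers would program or query their hardware shaper instead.

/* Driver-side sketch, assuming <net/pkt_cls.h> for the offload struct. */
static int foo_setup_tc_tbf(struct net_device *dev,
                            struct tc_tbf_qopt_offload *qopt)
{
        switch (qopt->command) {
        case TC_TBF_REPLACE:
                /* program a hardware shaper: rate in bytes/s, burst in bytes */
                netdev_info(dev, "TBF %08x: rate %llu Bps, max_size %u\n",
                            qopt->handle,
                            (unsigned long long)qopt->replace_params.rate.rate_bytes_ps,
                            qopt->replace_params.max_size);
                return 0;
        case TC_TBF_DESTROY:
                /* tear the shaper down; nothing to report back */
                return 0;
        case TC_TBF_STATS:
                /* fill qopt->stats.bstats / qopt->stats.qstats from hardware
                 * counters; returning -EOPNOTSUPP keeps software stats only */
                return -EOPNOTSUPP;
        default:
                return -EOPNOTSUPP;
        }
}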