From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2024 12:12:18 +0000 Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds dev->max_pacing_offload_horizon expressing this timing wheel horizon in nsec units. This is a read-only attribute. Unless a driver sets it, dev->max_pacing_offload_horizon is zero. v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 ) v3: added yaml doc (also per Jakub feedback) Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..49a7e7db0883 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2009,6 +2009,8 @@ enum netdev_reg_state { * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * + * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2399,6 +2401,8 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). 
*/ struct dim_irq_moder *irq_moder; + u64 max_pacing_offload_horizon; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..506ba9c80e83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; -- cgit v1.2.3 From f26080d47007df2ee90e65b7d390207ff3a588af Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 3 Oct 2024 12:12:19 +0000 Subject: net_sched: sch_fq: add the ability to offload pacing Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds the TCA_FQ_OFFLOAD_HORIZON attribute to the FQ packet scheduler. Its value is capped by the device max_pacing_offload_horizon, added in the prior patch. It allows FQ to let packets within the pacing offload horizon be delivered to the device, which will handle the needed delay without host involvement. 
Signed-off-by: Jeffrey Ji Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a3cd0c2dc995..25a9a47001cd 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -836,6 +836,8 @@ enum { TCA_FQ_WEIGHTS, /* Weights for each band */ + TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */ + __TCA_FQ_MAX }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 19a49af5a9e5..aeabf45c9200 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -111,6 +111,7 @@ struct fq_perband_flows { struct fq_sched_data { /* Read mostly cache line */ + u64 offload_horizon; u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -299,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q, } /* Fast path can be used if : - * 1) Packet tstamp is in the past. + * 1) Packet tstamp is in the past, or within the pacing offload horizon. * 2) FQ qlen == 0 OR * (no flow is currently eligible for transmit, * AND fast path queue has less than 8 packets) @@ -314,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, const struct fq_sched_data *q = qdisc_priv(sch); const struct sock *sk; - if (fq_skb_cb(skb)->time_to_send > now) + if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) return false; if (sch->q.qlen != 0) { @@ -595,15 +596,18 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) unsigned long sample; struct rb_node *p; - if (q->time_next_delayed_flow > now) + if (q->time_next_delayed_flow > now + q->offload_horizon) return; /* Update unthrottle latency EWMA. 
* This is cheap and can help diagnosing timer/latency problems. */ sample = (unsigned long)(now - q->time_next_delayed_flow); - q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; - q->unthrottle_latency_ns += sample >> 3; + if ((long)sample > 0) { + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + } + now += q->offload_horizon; q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { @@ -687,7 +691,7 @@ begin: u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); - if (now < time_next_packet) { + if (now + q->offload_horizon < time_next_packet) { head->first = f->next; f->time_next_packet = time_next_packet; fq_flow_set_throttled(q, f); @@ -925,6 +929,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)), [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)), + [TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 }, }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ @@ -1100,6 +1105,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->horizon_drop, nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); + if (tb[TCA_FQ_OFFLOAD_HORIZON]) { + u64 offload_horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]); + + if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) { + WRITE_ONCE(q->offload_horizon, offload_horizon); + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon"); + err = -EINVAL; + } + } if (!err) { sch_tree_unlock(sch); @@ -1183,6 +1199,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) .bands = FQ_BANDS, }; struct nlattr *opts; + u64 offload_horizon; u64 ce_threshold; s32 weights[3]; u64 horizon; @@ -1199,6 +1216,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) horizon = READ_ONCE(q->horizon); do_div(horizon, 
NSEC_PER_USEC); + offload_horizon = READ_ONCE(q->offload_horizon); + do_div(offload_horizon, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, @@ -1224,6 +1244,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_TIMER_SLACK, READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) || nla_put_u8(skb, TCA_FQ_HORIZON_DROP, READ_ONCE(q->horizon_drop))) goto nla_put_failure; -- cgit v1.2.3