author     Jakub Kicinski <kuba@kernel.org>  2024-08-13 03:50:36 +0300
committer  Jakub Kicinski <kuba@kernel.org>  2024-08-13 03:50:36 +0300
commit     e96f6fd30eec4ba6e2ff7f855f24b552c5853b06 (patch)
tree       bdac67c20b3de4cf6e7fc9a5755bf222b1cbd0d4
parent     246ef40670b71fef0c3e2cd11404279bc6d6468e (diff)
parent     4b808f44733243f981ae04046ef93a65ecc631a9 (diff)
download   linux-e96f6fd30eec4ba6e2ff7f855f24b552c5853b06.tar.xz
Merge branch 'net-nexthop-increase-weight-to-u16'
Petr Machata says:
====================
net: nexthop: Increase weight to u16
In CLOS networks, as link failures occur at various points in the network,
ECMP weights of the involved nodes are adjusted to compensate. With high
fan-out of the involved nodes, and overall high number of nodes,
a (non-)ECMP weight ratio that we would like to configure does not fit into
8 bits. Instead of, say, 255:254, we might like to configure something like
1000:999. For these deployments, the 8-bit weight may not be enough.
To that end, this patchset increases the next hop weight from u8 to u16.
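
A minimal standalone sketch of the resulting weight encoding (not taken from
the series itself; the struct and helper names here are illustrative — the
real uAPI struct is nexthop_grp and the real decode helper is
nexthop_grp_weight(), added in include/uapi/linux/nexthop.h below). The
configured weight minus one is split across the existing 8-bit weight field
and the repurposed weight_high byte:

#include <stdint.h>
#include <stdio.h>

struct grp_entry_ex {		/* layout mirrors struct nexthop_grp */
	uint32_t id;
	uint8_t  weight;	/* low 8 bits of (weight - 1) */
	uint8_t  weight_high;	/* high 8 bits of (weight - 1) */
	uint16_t resvd2;
};

/* Decode: same arithmetic as the new nexthop_grp_weight() helper. */
static uint16_t grp_weight(const struct grp_entry_ex *e)
{
	return ((e->weight_high << 8) | e->weight) + 1;
}

/* Encode a weight in [1, 65535]; mirrors what nla_put_nh_group() dumps. */
static void grp_set_weight(struct grp_entry_ex *e, uint32_t w)
{
	uint16_t stored = w - 1;

	e->weight = stored & 0xff;
	e->weight_high = stored >> 8;
}

int main(void)
{
	struct grp_entry_ex e = { .id = 1 };

	grp_set_weight(&e, 1000);
	/* prints: weight=231 weight_high=3 decoded=1000 */
	printf("weight=%u weight_high=%u decoded=%u\n",
	       (unsigned)e.weight, (unsigned)e.weight_high,
	       (unsigned)grp_weight(&e));
	return 0;
}
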
Patch #1 adds a flag that indicates whether the reserved fields are zeroed.
This is a follow-up to the fix merged in commit 6d745cd0e972 ("net:
nexthop: Initialize all fields in dumped nexthops"). The theory behind this
patch is that there is a strict ordering between the fields actually being
zeroed, the kernel declaring that they are, and the kernel repurposing the
fields. Thus clients can use the flag to tell if it is safe to interpret
the reserved fields in any way.
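
A minimal consumer-side sketch of that ordering argument (assumptions: the
caller has already parsed the NHA_OP_FLAGS value and one group entry out of
a dump; the flag value and struct layout come from the patch below, while
the helper and struct names are illustrative):

#include <stdint.h>

#define NHA_OP_FLAG_RESP_GRP_RESVD_0	(1U << 31)	/* dump clears resvd fields */

struct grp_entry {		/* mirrors struct nexthop_grp */
	uint32_t id;
	uint8_t  weight;
	uint8_t  weight_high;	/* was resvd1 before this series */
	uint16_t resvd2;
};

/*
 * Only kernels that set NHA_OP_FLAG_RESP_GRP_RESVD_0 declare that the
 * (former) reserved bytes are zeroed, so only then can weight_high be
 * trusted to carry the upper weight bits rather than stale data.
 */
static uint32_t entry_weight(uint32_t resp_op_flags, const struct grp_entry *e)
{
	if (resp_op_flags & NHA_OP_FLAG_RESP_GRP_RESVD_0)
		return (((uint32_t)e->weight_high << 8) | e->weight) + 1;

	return (uint32_t)e->weight + 1;	/* legacy 8-bit interpretation */
}
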
Patch #2 contains the substantial code and the commit message covers the
details of the changes.
Patches #3 to #6 add selftests.
====================
Link: https://patch.msgid.link/cover.1723036486.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--  include/net/nexthop.h                                          |  4
-rw-r--r--  include/uapi/linux/nexthop.h                                   | 10
-rw-r--r--  net/ipv4/nexthop.c                                             | 49
-rwxr-xr-x  tools/testing/selftests/net/fib_nexthops.sh                    | 55
-rw-r--r--  tools/testing/selftests/net/forwarding/lib.sh                  |  7
-rwxr-xr-x  tools/testing/selftests/net/forwarding/router_mpath_nh.sh     | 40
-rw-r--r--  tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh | 13
-rwxr-xr-x  tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh | 58
-rwxr-xr-x  tools/testing/selftests/net/forwarding/router_multipath.sh    |  2
9 files changed, 201 insertions, 37 deletions
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 68463aebcc05..d9fb44e8b321 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -105,7 +105,7 @@ struct nh_grp_entry_stats {
 struct nh_grp_entry {
 	struct nexthop	*nh;
 	struct nh_grp_entry_stats __percpu	*stats;
-	u8		weight;
+	u16		weight;
 
 	union {
 		struct {
@@ -192,7 +192,7 @@ struct nh_notifier_single_info {
 };
 
 struct nh_notifier_grp_entry_info {
-	u8 weight;
+	u16 weight;
 	struct nh_notifier_single_info nh;
 };
 
diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index dd8787f9cf39..bc49baf4a267 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -16,10 +16,15 @@ struct nhmsg {
 struct nexthop_grp {
 	__u32	id;	  /* nexthop id - must exist */
 	__u8	weight;   /* weight of this nexthop */
-	__u8	resvd1;
+	__u8	weight_high;	/* high order bits of weight */
 	__u16	resvd2;
 };
 
+static inline __u16 nexthop_grp_weight(const struct nexthop_grp *entry)
+{
+	return ((entry->weight_high << 8) | entry->weight) + 1;
+}
+
 enum {
 	NEXTHOP_GRP_TYPE_MPATH,  /* hash-threshold nexthop group
 				  * default type if not specified
@@ -33,6 +38,9 @@ enum {
 #define NHA_OP_FLAG_DUMP_STATS		BIT(0)
 #define NHA_OP_FLAG_DUMP_HW_STATS	BIT(1)
 
+/* Response OP_FLAGS. */
+#define NHA_OP_FLAG_RESP_GRP_RESVD_0	BIT(31) /* Dump clears resvd fields. */
+
 enum {
 	NHA_UNSPEC,
 	NHA_ID,		/* u32; id for nexthop. id == 0 means auto-assign */
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 6b9787ee8601..9db10d409074 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -865,15 +865,18 @@ out:
 }
 
 static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
-			    u32 op_flags)
+			    u32 op_flags, u32 *resp_op_flags)
 {
 	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 	struct nexthop_grp *p;
 	size_t len = nhg->num_nh * sizeof(*p);
 	struct nlattr *nla;
 	u16 group_type = 0;
+	u16 weight;
 	int i;
 
+	*resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;
+
 	if (nhg->hash_threshold)
 		group_type = NEXTHOP_GRP_TYPE_MPATH;
 	else if (nhg->resilient)
@@ -888,9 +891,12 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
 
 	p = nla_data(nla);
 	for (i = 0; i < nhg->num_nh; ++i) {
+		weight = nhg->nh_entries[i].weight - 1;
+
 		*p++ = (struct nexthop_grp) {
 			.id = nhg->nh_entries[i].nh->id,
-			.weight = nhg->nh_entries[i].weight - 1,
+			.weight = weight,
+			.weight_high = weight >> 8,
 		};
 	}
 
@@ -934,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 
 	if (nh->is_group) {
 		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+		u32 resp_op_flags = 0;
 
 		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
 			goto nla_put_failure;
-		if (nla_put_nh_group(skb, nh, op_flags))
+		if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
+		    nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
 			goto nla_put_failure;
 		goto out;
 	}
@@ -1050,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
 	sz += nla_total_size(4); /* NHA_ID */
 
 	if (nh->is_group)
-		sz += nh_nlmsg_size_grp(nh);
+		sz += nh_nlmsg_size_grp(nh) +
+		      nla_total_size(4) +	/* NHA_OP_FLAGS */
+		      0;
 	else
 		sz += nh_nlmsg_size_single(nh);
 
@@ -1280,11 +1290,14 @@ static int nh_check_attr_group(struct net *net,
 
 	nhg = nla_data(tb[NHA_GROUP]);
 	for (i = 0; i < len; ++i) {
-		if (nhg[i].resvd1 || nhg[i].resvd2) {
-			NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
+		if (nhg[i].resvd2) {
+			NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
 			return -EINVAL;
 		}
-		if (nhg[i].weight > 254) {
+		if (nexthop_grp_weight(&nhg[i]) == 0) {
+			/* 0xffff got passed in, representing weight of 0x10000,
+			 * which is too heavy.
+			 */
 			NL_SET_ERR_MSG(extack, "Invalid value for weight");
 			return -EINVAL;
 		}
@@ -1880,9 +1893,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
 static void nh_res_group_rebalance(struct nh_group *nhg,
 				   struct nh_res_table *res_table)
 {
-	int prev_upper_bound = 0;
-	int total = 0;
-	int w = 0;
+	u16 prev_upper_bound = 0;
+	u32 total = 0;
+	u32 w = 0;
 	int i;
 
 	INIT_LIST_HEAD(&res_table->uw_nh_entries);
@@ -1892,11 +1905,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg,
 
 	for (i = 0; i < nhg->num_nh; ++i) {
 		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
-		int upper_bound;
+		u16 upper_bound;
+		u64 btw;
 
 		w += nhge->weight;
-		upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
-						total);
+		btw = ((u64)res_table->num_nh_buckets) * w;
+		upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
 		nhge->res.wants_buckets = upper_bound - prev_upper_bound;
 		prev_upper_bound = upper_bound;
 
@@ -1962,8 +1976,8 @@ static void replace_nexthop_grp_res(struct nh_group *oldg,
 
 static void nh_hthr_group_rebalance(struct nh_group *nhg)
 {
-	int total = 0;
-	int w = 0;
+	u32 total = 0;
+	u32 w = 0;
 	int i;
 
 	for (i = 0; i < nhg->num_nh; ++i)
@@ -1971,7 +1985,7 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg)
 
 	for (i = 0; i < nhg->num_nh; ++i) {
 		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
-		int upper_bound;
+		u32 upper_bound;
 
 		w += nhge->weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
@@ -2713,7 +2727,8 @@ static struct nexthop *nexthop_create_group(struct net *net,
 			goto out_no_nh;
 		}
 		nhg->nh_entries[i].nh = nhe;
-		nhg->nh_entries[i].weight = entry[i].weight + 1;
+		nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);
+
 		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
 		nhg->nh_entries[i].nh_parent = nh;
 	}
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index ac0b2c6a5761..77c83d9508d3 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -78,7 +78,12 @@ log_test()
 	else
 		ret=1
 		nfail=$((nfail+1))
-		printf "TEST: %-60s [FAIL]\n" "${msg}"
+		if [[ $rc -eq $ksft_skip ]]; then
+			printf "TEST: %-60s [SKIP]\n" "${msg}"
+		else
+			printf "TEST: %-60s [FAIL]\n" "${msg}"
+		fi
+
 		if [ "$VERBOSE" = "1" ]; then
 			echo "    rc=$rc, expected $expected"
 		fi
@@ -923,6 +928,29 @@ ipv6_grp_fcnal()
 	ipv6_grp_refs
 	log_test $? 0 "Nexthop group replace refcounts"
+
+	#
+	# 16-bit weights.
+	#
+	run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+	run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+	run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
+	run_cmd "$IP nexthop add id 66 dev veth1"
+
+	run_cmd "$IP nexthop add id 103 group 62,1000"
+	if [[ $? == 0 ]]; then
+		local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535"
+		run_cmd "$IP nexthop replace $GRP"
+		check_nexthop "id 103" "$GRP"
+		rc=$?
+	else
+		rc=$ksft_skip
+	fi
+
+	$IP nexthop flush >/dev/null 2>&1
+
+	log_test $rc 0 "16-bit weights"
 }
 
 ipv6_res_grp_fcnal()
 {
@@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal()
 	check_nexthop_bucket "list id 102" \
 		"id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62"
 	log_test $? 0 "Nexthop buckets updated after replace - nECMP"
+
+	#
+	# 16-bit weights.
+	#
+	run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+	run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+	run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
+	run_cmd "$IP nexthop add id 66 dev veth1"
+
+	run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32"
+	if [[ $? == 0 ]]; then
+		local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(:
+				)type resilient buckets 32 idle_timer 0 $(:
+				)unbalanced_timer 0"
+		run_cmd "$IP nexthop replace $GRP"
+		check_nexthop "id 103" "$GRP unbalanced_time 0"
+		rc=$?
+	else
+		rc=$ksft_skip
+	fi
+
+	$IP nexthop flush >/dev/null 2>&1
+
+	log_test $rc 0 "16-bit weights"
 }
 
 ipv6_fcnal_runtime()
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index ff96bb7535ff..cb0fcd6f0293 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -509,6 +509,13 @@ xfail_on_slow()
 	fi
 }
 
+omit_on_slow()
+{
+	if [[ $KSFT_MACHINE_SLOW != yes ]]; then
+		"$@"
+	fi
+}
+
 xfail_on_veth()
 {
 	local dev=$1; shift
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
index 2ba44247c60a..a7d8399c8d4f 100755
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
@@ -40,6 +40,7 @@ ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
 	multipath_test
+	multipath16_test
 	ping_ipv4_blackhole
 	ping_ipv6_blackhole
 	nh_stats_test_v4
@@ -226,9 +227,11 @@ routing_nh_obj()
 
 multipath4_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -242,7 +245,8 @@ multipath4_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -258,9 +262,11 @@ multipath6_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -275,7 +281,8 @@ multipath6_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -313,6 +320,23 @@ multipath_test()
 	multipath6_test "Weighted MP 11:45" 11 45
 }
 
+multipath16_test()
+{
+	check_nhgw16 104 || return
+
+	log_info "Running 16-bit IPv4 multipath tests"
+	multipath4_test "65535:65535" 65535 65535
+	multipath4_test "128:512" 128 512
+	omit_on_slow \
+		multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+
+	log_info "Running 16-bit IPv6 multipath tests"
+	multipath6_test "65535:65535" 65535 65535
+	multipath6_test "128:512" 128 512
+	omit_on_slow \
+		multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+}
+
 ping_ipv4_blackhole()
 {
 	RET=0
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
index 2903294d8bca..507b2852dabe 100644
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
@@ -117,3 +117,16 @@ __nh_stats_test_v6()
 		$MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2
 	sysctl_restore net.ipv6.fib_multipath_hash_policy
 }
+
+check_nhgw16()
+{
+	local nhid=$1; shift
+
+	ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null
+	if (( $? )); then
+		log_test_skip "16-bit multipath tests" \
+			"iproute2 or the kernel do not support 16-bit next hop weights"
+		return 1
+	fi
+	ip nexthop del id 9999 ||:
+}
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
index cd9e346436fc..88ddae05b39d 100755
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
@@ -40,6 +40,7 @@ ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
 	multipath_test
+	multipath16_test
 	nh_stats_test_v4
 	nh_stats_test_v6
 "
@@ -228,9 +229,11 @@ routing_nh_obj()
 
 multipath4_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -243,7 +246,8 @@ multipath4_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -258,9 +262,11 @@ multipath6_l4_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -273,7 +279,8 @@ multipath6_l4_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -371,6 +378,41 @@ multipath_test()
 	ip nexthop replace id 106 group 104,1/105,1 type resilient
 }
 
+multipath16_test()
+{
+	check_nhgw16 104 || return
+
+	log_info "Running 16-bit IPv4 multipath tests"
+	ip nexthop replace id 103 group 101/102 type resilient idle_timer 0
+
+	ip nexthop replace id 103 group 101,65535/102,65535 type resilient
+	multipath4_test "65535:65535" 65535 65535
+
+	ip nexthop replace id 103 group 101,128/102,512 type resilient
+	multipath4_test "128:512" 128 512
+
+	ip nexthop replace id 103 group 101,255/102,65535 type resilient
+	omit_on_slow \
+		multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+
+	ip nexthop replace id 103 group 101,1/102,1 type resilient
+
+	log_info "Running 16-bit IPv6 L4 hash multipath tests"
+	ip nexthop replace id 106 group 104/105 type resilient idle_timer 0
+
+	ip nexthop replace id 106 group 104,65535/105,65535 type resilient
+	multipath6_l4_test "65535:65535" 65535 65535
+
+	ip nexthop replace id 106 group 104,128/105,512 type resilient
+	multipath6_l4_test "128:512" 128 512
+
+	ip nexthop replace id 106 group 104,255/105,65535 type resilient
+	omit_on_slow \
+		multipath6_l4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+
+	ip nexthop replace id 106 group 104,1/105,1 type resilient
+}
+
 nh_stats_test_v4()
 {
 	__nh_stats_test_v4 resilient
diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh
index e2be354167a1..46f365b557b7 100755
--- a/tools/testing/selftests/net/forwarding/router_multipath.sh
+++ b/tools/testing/selftests/net/forwarding/router_multipath.sh
@@ -180,6 +180,7 @@ multipath4_test()
 
 	ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
 		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -217,6 +218,7 @@ multipath6_test()
 
 	$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
 		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)