Diffstat (limited to 'net/ipv4')
35 files changed, 2301 insertions, 1187 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..bbdd9c44f14e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o +obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1355e6c0d567..f17870ee558b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, + .read_sock = udp_read_sock, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index d520e61649c8..dff4f0eb96b0 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -5,6 +5,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <net/tcp.h> #include <net/bpf_sk_storage.h> @@ -178,10 +179,52 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } +BTF_SET_START(bpf_tcp_ca_kfunc_ids) +BTF_ID(func, tcp_reno_ssthresh) +BTF_ID(func, tcp_reno_cong_avoid) +BTF_ID(func, tcp_reno_undo_cwnd) +BTF_ID(func, tcp_slow_start) +BTF_ID(func, tcp_cong_avoid_ai) +#ifdef CONFIG_DYNAMIC_FTRACE +#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) +BTF_ID(func, cubictcp_init) +BTF_ID(func, cubictcp_recalc_ssthresh) +BTF_ID(func, cubictcp_cong_avoid) +BTF_ID(func, cubictcp_state) +BTF_ID(func, cubictcp_cwnd_event) +BTF_ID(func, cubictcp_acked) +#endif +#if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP) +BTF_ID(func, dctcp_init) +BTF_ID(func, dctcp_update_alpha) +BTF_ID(func, dctcp_cwnd_event) +BTF_ID(func, dctcp_ssthresh) +BTF_ID(func, dctcp_cwnd_undo) +BTF_ID(func, dctcp_state) +#endif +#if IS_BUILTIN(CONFIG_TCP_CONG_BBR) +BTF_ID(func, bbr_init) +BTF_ID(func, bbr_main) +BTF_ID(func, bbr_sndbuf_expand) +BTF_ID(func, bbr_undo_cwnd) +BTF_ID(func, bbr_cwnd_event) +BTF_ID(func, bbr_ssthresh) +BTF_ID(func, bbr_min_tso_segs) +BTF_ID(func, bbr_set_state) +#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +BTF_SET_END(bpf_tcp_ca_kfunc_ids) + +static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) +{ + return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id); +} + static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { .get_func_proto = bpf_tcp_ca_get_func_proto, .is_valid_access = bpf_tcp_ca_is_valid_access, .btf_struct_access = bpf_tcp_ca_btf_struct_access, + .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, }; static int bpf_tcp_ca_init_member(const struct btf_type *t, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4b834bbf95e0..35803ab7ac80 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -309,7 +309,7 @@ static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, struct esp_output_extra *extra) { /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after + * accommodate the high bits. We will move it back after * encryption. 
*/ if ((x->props.flags & XFRM_STATE_ESN)) { @@ -754,7 +754,7 @@ int esp_input_done2(struct sk_buff *skb, int err) int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int ihl; - if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + if (!xo || !(xo->flags & CRYPTO_DONE)) kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) @@ -854,7 +854,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) struct ip_esp_hdr *esph; /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after + * accommodate the high bits. We will move it back after * decryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 616e2dc1c8fa..7b6931a4d775 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -971,7 +971,7 @@ static bool icmp_redirect(struct sk_buff *skb) } /* - * Handle ICMP_ECHO ("ping") requests. + * Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests. * * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo * requests. @@ -979,27 +979,125 @@ static bool icmp_redirect(struct sk_buff *skb) * included in the reply. * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring * echo requests, MUST have default=NOT. + * RFC 8335: 8 MUST have a config option to enable/disable ICMP + * Extended Echo Functionality, MUST be disabled by default * See also WRT handling of options once they are done and working. */ static bool icmp_echo(struct sk_buff *skb) { + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; + struct icmp_bxm icmp_param; + struct net_device *dev; + char buff[IFNAMSIZ]; struct net *net; + u16 ident_len; + u8 status; net = dev_net(skb_dst(skb)->dev); - if (!net->ipv4.sysctl_icmp_echo_ignore_all) { - struct icmp_bxm icmp_param; + /* should there be an ICMP stat for ignored echos? */ + if (net->ipv4.sysctl_icmp_echo_ignore_all) + return true; - icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = skb->len; + icmp_param.head_len = sizeof(struct icmphdr); + + if (icmp_param.data.icmph.type == ICMP_ECHO) { icmp_param.data.icmph.type = ICMP_ECHOREPLY; - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = skb->len; - icmp_param.head_len = sizeof(struct icmphdr); - icmp_reply(&icmp_param, skb); + goto send_reply; } - /* should there be an ICMP stat for ignored echos? */ - return true; + if (!net->ipv4.sysctl_icmp_echo_enable_probe) + return true; + /* We currently only support probing interfaces on the proxy node + * Check to ensure L-bit is set + */ + if (!(ntohs(icmp_param.data.icmph.un.echo.sequence) & 1)) + return true; + /* Clear status bits in reply message */ + icmp_param.data.icmph.un.echo.sequence &= htons(0xFF00); + icmp_param.data.icmph.type = ICMP_EXT_ECHOREPLY; + ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); + /* Size of iio is class_type dependent. 
+ * Only check header here and assign length based on ctype in the switch statement + */ + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); + if (!ext_hdr || !iio) + goto send_mal_query; + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr)) + goto send_mal_query; + ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + status = 0; + dev = NULL; + switch (iio->extobj_hdr.class_type) { + case ICMP_EXT_ECHO_CTYPE_NAME: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); + if (ident_len >= IFNAMSIZ) + goto send_mal_query; + memset(buff, 0, sizeof(buff)); + memcpy(buff, &iio->ident.name, ident_len); + dev = dev_get_by_name(net, buff); + break; + case ICMP_EXT_ECHO_CTYPE_INDEX: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + + sizeof(iio->ident.ifindex), &_iio); + if (ident_len != sizeof(iio->ident.ifindex)) + goto send_mal_query; + dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); + break; + case ICMP_EXT_ECHO_CTYPE_ADDR: + if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + iio->ident.addr.ctype3_hdr.addrlen) + goto send_mal_query; + switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { + case ICMP_AFI_IP: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + + sizeof(struct in_addr), &_iio); + if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + sizeof(struct in_addr)) + goto send_mal_query; + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr.s_addr); + break; +#if IS_ENABLED(CONFIG_IPV6) + case ICMP_AFI_IP6: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); + if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + sizeof(struct in6_addr)) + goto send_mal_query; + dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); + if (dev) + dev_hold(dev); + break; +#endif + default: + goto send_mal_query; + } + break; + default: + goto send_mal_query; + } + if (!dev) { + icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF; + goto send_reply; + } + /* Fill bits in reply message */ + if (dev->flags & IFF_UP) + status |= ICMP_EXT_ECHOREPLY_ACTIVE; + if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list) + status |= ICMP_EXT_ECHOREPLY_IPV4; + if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) + status |= ICMP_EXT_ECHOREPLY_IPV6; + dev_put(dev); + icmp_param.data.icmph.un.echo.sequence |= htons(status); +send_reply: + icmp_reply(&icmp_param, skb); + return true; +send_mal_query: + icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY; + goto send_reply; } /* @@ -1088,6 +1186,21 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); ICMPMSGIN_INC_STATS(net, icmph->type); + + /* Check for ICMP Extended Echo (PROBE) messages */ + if (icmph->type == ICMP_EXT_ECHO) { + /* We can't use icmp_pointers[].handler() because it is an array of + * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42. + */ + success = icmp_echo(skb); + goto success_check; + } + + if (icmph->type == ICMP_EXT_ECHOREPLY) { + success = ping_rcv(skb); + goto success_check; + } + /* * 18 is the highest 'known' ICMP type. 
Anything else is a mystery * @@ -1097,7 +1210,6 @@ int icmp_rcv(struct sk_buff *skb) if (icmph->type > NR_ICMP_TYPES) goto error; - /* * Parse the ICMP message */ @@ -1123,7 +1235,7 @@ int icmp_rcv(struct sk_buff *skb) } success = icmp_pointers[icmph->type].handler(skb); - +success_check: if (success) { consume_skb(skb); return NET_RX_SUCCESS; @@ -1340,6 +1452,7 @@ static int __net_init icmp_sk_init(struct net *net) /* Control parameters for ECHO replies. */ net->ipv4.sysctl_icmp_echo_ignore_all = 0; + net->ipv4.sysctl_icmp_echo_enable_probe = 0; net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; /* Control parameter - ignore bogus broadcast responses? */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3aab53beb4ea..c3efc7d658f6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -34,7 +34,7 @@ * Andi Kleen : Replace ip_reply with ip_send_reply. * Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 - * and more readibility. + * and more readability. * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. @@ -262,7 +262,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an - * insufficent MTU. + * insufficient MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 7c841037c533..aff707988e23 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -25,6 +25,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un __be32 saddr = iph->saddr; __u8 flags; struct net_device *dev = skb_dst(skb)->dev; + struct flow_keys flkeys; unsigned int hh_len; sk = sk_to_full_sk(sk); @@ -48,6 +49,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un fl4.flowi4_oif = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_flags = flags; + fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a2f4f894be2b..63cb953bd019 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -76,12 +76,18 @@ config NF_DUP_IPV4 config NF_LOG_ARP tristate "ARP packet logging" default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON + select NF_LOG_SYSLOG + help + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG. config NF_LOG_IPV4 tristate "IPv4 packet logging" default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON + select NF_LOG_SYSLOG + help + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG. 
config NF_REJECT_IPV4 tristate "IPv4 packet rejection" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7c497c78105f..f38fb1368ddb 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -9,10 +9,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o -# logging -obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o -obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o - # reject obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d6d45d820d79..cf20316094d0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -713,7 +713,7 @@ static int copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -800,7 +800,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif @@ -808,7 +808,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { @@ -835,7 +835,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif @@ -1044,7 +1044,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_arpt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1412,7 +1412,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, switch (cmd) { case ARPT_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1444,7 +1444,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; case ARPT_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1499,10 +1499,11 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, - const struct nf_hook_ops *ops, - struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1516,41 +1517,61 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return 
PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; + } - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __arpt_unregister_table(net, new_table); - *res = NULL; + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; } + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __arpt_unregister_table(net, new_table); return ret; } -void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void arpt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + + if (table) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(arpt_unregister_table_pre_exit); -void arpt_unregister_table(struct net *net, struct xt_table *table) +void arpt_unregister_table(struct net *net, const char *name) { - __arpt_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + + if (table) + __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ @@ -1559,7 +1580,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6c300ba5634e..b8f45e9bbec8 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -34,7 +34,7 @@ static unsigned int arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); + return arpt_do_table(skb, state, priv); } static struct nf_hook_ops *arpfilter_ops __read_mostly; @@ -44,31 +44,22 @@ static int __net_init arptable_filter_table_init(struct net *net) struct arpt_replace *repl; int err; - if (net->ipv4.arptable_filter) - return 0; - repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; - err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, - &net->ipv4.arptable_filter); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops); kfree(repl); return err; } static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - if (net->ipv4.arptable_filter) - arpt_unregister_table_pre_exit(net, net->ipv4.arptable_filter, - arpfilter_ops); + arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops); } static void __net_exit arptable_filter_net_exit(struct net *net) { - if (!net->ipv4.arptable_filter) - return; - arpt_unregister_table(net, net->ipv4.arptable_filter); - net->ipv4.arptable_filter = NULL; + arpt_unregister_table(net, "filter"); } static struct pernet_operations arptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f77ea0dbe656..13acb687c19a 
100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -868,7 +868,7 @@ copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -957,7 +957,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif @@ -965,7 +965,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { @@ -993,7 +993,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif @@ -1199,7 +1199,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1621,7 +1621,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1654,7 +1654,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; case IPT_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1716,9 +1716,11 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1732,50 +1734,65 @@ int ipt_register_table(struct net *net, const struct xt_table *table, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); - if (!ops) + /* No template? No need to do anything. This is used by 'nat' table, it registers + * with the nat core instead of the netfilter core. 
+ */ + if (!template_ops) return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __ipt_unregister_table(net, new_table); - *res = NULL; + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; + } + + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; } + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __ipt_unregister_table(net, new_table); return ret; } -void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void ipt_unregister_table_pre_exit(struct net *net, const char *name) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); -} + struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); -void ipt_unregister_table_exit(struct net *net, struct xt_table *table) -{ - __ipt_unregister_table(net, table); + if (table) + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } -void ipt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void ipt_unregister_table_exit(struct net *net, const char *name) { - if (ops) - ipt_unregister_table_pre_exit(net, table, ops); - __ipt_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + + if (table) + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ @@ -1829,7 +1846,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, @@ -1924,7 +1941,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table); EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index a8b980ad11d4..8f7ca67475b7 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -541,7 +541,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_clusterip_tgt_info { u_int32_t flags; @@ -553,7 +553,7 @@ struct compat_ipt_clusterip_tgt_info u_int32_t hash_initval; compat_uptr_t config; }; -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", @@ -563,9 +563,9 @@ static struct xt_target clusterip_tg_reg __read_mostly = { .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), .usersize = offsetof(struct ipt_clusterip_tgt_info, config), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ .me = THIS_MODULE }; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 
8f7bc1ee7453..8272df7c6ad5 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -34,7 +34,7 @@ static unsigned int iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *filter_ops __read_mostly; @@ -48,9 +48,6 @@ static int __net_init iptable_filter_table_init(struct net *net) struct ipt_replace *repl; int err; - if (net->ipv4.iptable_filter) - return 0; - repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; @@ -58,8 +55,7 @@ static int __net_init iptable_filter_table_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - err = ipt_register_table(net, &packet_filter, repl, filter_ops, - &net->ipv4.iptable_filter); + err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } @@ -74,17 +70,12 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_filter) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_filter, - filter_ops); + ipt_unregister_table_pre_exit(net, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) { - if (!net->ipv4.iptable_filter) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_filter); - net->ipv4.iptable_filter = NULL; + ipt_unregister_table_exit(net, "filter"); } static struct pernet_operations iptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 833079589273..2abc3836f391 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -37,7 +37,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) +ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; const struct iphdr *iph; @@ -53,7 +53,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, priv); /* Reroute for ANY change. 
*/ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -78,8 +78,8 @@ iptable_mangle_hook(void *priv, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ipt_mangle_out(skb, state); - return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); + return ipt_mangle_out(skb, state, priv); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; @@ -88,31 +88,22 @@ static int __net_init iptable_mangle_table_init(struct net *net) struct ipt_replace *repl; int ret; - if (net->ipv4.iptable_mangle) - return 0; - repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, - &net->ipv4.iptable_mangle); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_mangle) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_mangle, - mangle_ops); + ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { - if (!net->ipv4.iptable_mangle) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_mangle); - net->ipv4.iptable_mangle = NULL; + ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index b0143b109f25..a9913842ef18 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -13,8 +13,14 @@ #include <net/netfilter/nf_nat.h> +struct iptable_nat_pernet { + struct nf_hook_ops *nf_nat_ops; +}; + static int __net_init iptable_nat_table_init(struct net *net); +static unsigned int iptable_nat_net_id __read_mostly; + static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -30,7 +36,7 @@ static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.nat_table); + return ipt_do_table(skb, state, priv); } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { @@ -62,27 +68,49 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { static int ipt_nat_register_lookups(struct net *net) { + struct iptable_nat_pernet *xt_nat_net; + struct nf_hook_ops *ops; + struct xt_table *table; int i, ret; + xt_nat_net = net_generic(net, iptable_nat_net_id); + table = xt_find_table(net, NFPROTO_IPV4, "nat"); + if (WARN_ON_ONCE(!table)) + return -ENOENT; + + ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) { - ret = nf_nat_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]); + ops[i].priv = table; + ret = nf_nat_ipv4_register_fn(net, &ops[i]); if (ret) { while (i) - nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]); + nf_nat_ipv4_unregister_fn(net, &ops[--i]); + kfree(ops); return ret; } } + xt_nat_net->nf_nat_ops = ops; return 0; } static void ipt_nat_unregister_lookups(struct net *net) { + struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id); + struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops; int i; + if (!ops) + return; + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) - nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]); + nf_nat_ipv4_unregister_fn(net, &ops[i]); + + kfree(ops); } static int __net_init 
iptable_nat_table_init(struct net *net) @@ -90,24 +118,19 @@ static int __net_init iptable_nat_table_init(struct net *net) struct ipt_replace *repl; int ret; - if (net->ipv4.nat_table) - return 0; - repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, - NULL, &net->ipv4.nat_table); + + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, NULL); if (ret < 0) { kfree(repl); return ret; } ret = ipt_nat_register_lookups(net); - if (ret < 0) { - ipt_unregister_table(net, net->ipv4.nat_table, NULL); - net->ipv4.nat_table = NULL; - } + if (ret < 0) + ipt_unregister_table_exit(net, "nat"); kfree(repl); return ret; @@ -115,21 +138,19 @@ static int __net_init iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { - if (net->ipv4.nat_table) - ipt_nat_unregister_lookups(net); + ipt_nat_unregister_lookups(net); } static void __net_exit iptable_nat_net_exit(struct net *net) { - if (!net->ipv4.nat_table) - return; - ipt_unregister_table_exit(net, net->ipv4.nat_table); - net->ipv4.nat_table = NULL; + ipt_unregister_table_exit(net, "nat"); } static struct pernet_operations iptable_nat_net_ops = { .pre_exit = iptable_nat_net_pre_exit, .exit = iptable_nat_net_exit, + .id = &iptable_nat_net_id, + .size = sizeof(struct iptable_nat_pernet), }; static int __init iptable_nat_init(void) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 9abfe6bf2cb9..ceef397c1f5f 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -41,7 +41,7 @@ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -55,31 +55,22 @@ static int __net_init iptable_raw_table_init(struct net *net) if (raw_before_defrag) table = &packet_raw_before_defrag; - if (net->ipv4.iptable_raw) - return 0; - repl = ipt_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, table, repl, rawtable_ops, - &net->ipv4.iptable_raw); + ret = ipt_register_table(net, table, repl, rawtable_ops); kfree(repl); return ret; } static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_raw) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_raw, - rawtable_ops); + ipt_unregister_table_pre_exit(net, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) { - if (!net->ipv4.iptable_raw) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_raw); - net->ipv4.iptable_raw = NULL; + ipt_unregister_table_exit(net, "raw"); } static struct pernet_operations iptable_raw_net_ops = { diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 415c1975d770..77973f5fd8f6 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -40,7 +40,7 @@ static unsigned int iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_security); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -50,31 +50,22 @@ static int __net_init iptable_security_table_init(struct net *net) struct ipt_replace *repl; int ret; - if (net->ipv4.iptable_security) - return 0; - repl = 
ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &security_table, repl, sectbl_ops, - &net->ipv4.iptable_security); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit iptable_security_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_security) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_security, - sectbl_ops); + ipt_unregister_table_pre_exit(net, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) { - if (!net->ipv4.iptable_security) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_security); - net->ipv4.iptable_security = NULL; + ipt_unregister_table_exit(net, "security"); } static struct pernet_operations iptable_security_net_ops = { diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 8115611aa47d..613432a36f0a 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -20,8 +20,13 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> +static unsigned int defrag4_pernet_id __read_mostly; static DEFINE_MUTEX(defrag4_mutex); +struct defrag4_pernet { + unsigned int users; +}; + static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -106,15 +111,19 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = { static void __net_exit defrag4_net_exit(struct net *net) { - if (net->nf.defrag_ipv4) { + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); + + if (nf_defrag->users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); - net->nf.defrag_ipv4 = false; + nf_defrag->users = 0; } } static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, + .id = &defrag4_pernet_id, + .size = sizeof(struct defrag4_pernet), }; static int __init nf_defrag_init(void) @@ -129,21 +138,24 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv4_enable(struct net *net) { + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; - might_sleep(); - - if (net->nf.defrag_ipv4) - return 0; - mutex_lock(&defrag4_mutex); - if (net->nf.defrag_ipv4) + if (nf_defrag->users == UINT_MAX) { + err = -EOVERFLOW; goto out_unlock; + } + + if (nf_defrag->users) { + nf_defrag->users++; + goto out_unlock; + } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) - net->nf.defrag_ipv4 = true; + nf_defrag->users = 1; out_unlock: mutex_unlock(&defrag4_mutex); @@ -151,6 +163,22 @@ int nf_defrag_ipv4_enable(struct net *net) } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); +void nf_defrag_ipv4_disable(struct net *net) +{ + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); + + mutex_lock(&defrag4_mutex); + if (nf_defrag->users) { + nf_defrag->users--; + if (nf_defrag->users == 0) + nf_unregister_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + } + + mutex_unlock(&defrag4_mutex); +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable); + module_init(nf_defrag_init); module_exit(nf_defrag_fini); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c deleted file mode 100644 index 136030ad2e54..000000000000 --- a/net/ipv4/netfilter/nf_log_arp.c +++ /dev/null @@ -1,172 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> - * - * Based on code from ebt_log from: - * - * Bart De Schuymer <bdschuym@pandora.be> - * 
Harald Welte <laforge@netfilter.org> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -static const struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - -struct arppayload { - unsigned char mac_src[ETH_ALEN]; - unsigned char ip_src[4]; - unsigned char mac_dst[ETH_ALEN]; - unsigned char ip_dst[4]; -}; - -static void dump_arp_packet(struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int nhoff) -{ - const struct arppayload *ap; - struct arppayload _arpp; - const struct arphdr *ah; - unsigned int logflags; - struct arphdr _arph; - - ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); - if (ah == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_DEFAULT_MASK; - - if (logflags & NF_LOG_MACDECODE) { - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); - nf_log_dump_vlan(m, skb); - nf_log_buf_add(m, "MACPROTO=%04x ", - ntohs(eth_hdr(skb)->h_proto)); - } - - nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", - ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); - - /* If it's for Ethernet and the lengths are OK, then log the ARP - * payload. - */ - if (ah->ar_hrd != htons(ARPHRD_ETHER) || - ah->ar_hln != ETH_ALEN || - ah->ar_pln != sizeof(__be32)) - return; - - ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); - if (ap == NULL) { - nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", - skb->len - sizeof(_arph)); - return; - } - nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", - ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); -} - -static void nf_log_arp_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct nf_log_buf *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) - return; - - m = nf_log_buf_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); - dump_arp_packet(m, loginfo, skb, 0); - - nf_log_buf_close(m); -} - -static struct nf_logger nf_arp_logger __read_mostly = { - .name = "nf_log_arp", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_arp_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_arp_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); -} - -static void __net_exit nf_log_arp_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_arp_logger); -} - -static struct pernet_operations nf_log_arp_net_ops = { - .init = nf_log_arp_net_init, - .exit = nf_log_arp_net_exit, -}; - -static int __init nf_log_arp_init(void) -{ - int ret; - - ret = register_pernet_subsys(&nf_log_arp_net_ops); - if (ret < 0) - return ret; - - ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); - if (ret < 0) { - pr_err("failed to register logger\n"); - goto err1; - } - - return 0; - -err1: - 
unregister_pernet_subsys(&nf_log_arp_net_ops); - return ret; -} - -static void __exit nf_log_arp_exit(void) -{ - unregister_pernet_subsys(&nf_log_arp_net_ops); - nf_log_unregister(&nf_arp_logger); -} - -module_init(nf_log_arp_init); -module_exit(nf_log_arp_exit); - -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter ARP packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(3, 0); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c deleted file mode 100644 index d07583fac8f8..000000000000 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ /dev/null @@ -1,395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/ipv6.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -static const struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - -/* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int iphoff) -{ - struct iphdr _iph; - const struct iphdr *ih; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_DEFAULT_MASK; - - ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); - if (ih == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - /* Important fields: - * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
*/ - /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); - - /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, - ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); - - /* Max length: 6 "CE DF MF " */ - if (ntohs(ih->frag_off) & IP_CE) - nf_log_buf_add(m, "CE "); - if (ntohs(ih->frag_off) & IP_DF) - nf_log_buf_add(m, "DF "); - if (ntohs(ih->frag_off) & IP_MF) - nf_log_buf_add(m, "MF "); - - /* Max length: 11 "FRAG:65535 " */ - if (ntohs(ih->frag_off) & IP_OFFSET) - nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - - if ((logflags & NF_LOG_IPOPT) && - ih->ihl * 4 > sizeof(struct iphdr)) { - const unsigned char *op; - unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; - unsigned int i, optsize; - - optsize = ih->ihl * 4 - sizeof(struct iphdr); - op = skb_header_pointer(skb, iphoff+sizeof(_iph), - optsize, _opt); - if (op == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - nf_log_buf_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - nf_log_buf_add(m, "%02X", op[i]); - nf_log_buf_add(m, ") "); - } - - switch (ih->protocol) { - case IPPROTO_TCP: - if (nf_log_dump_tcp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4, logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (nf_log_dump_udp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4)) - return; - break; - case IPPROTO_ICMP: { - struct icmphdr _icmph; - const struct icmphdr *ich; - static const size_t required_len[NR_ICMP_TYPES+1] - = { [ICMP_ECHOREPLY] = 4, - [ICMP_DEST_UNREACH] - = 8 + sizeof(struct iphdr), - [ICMP_SOURCE_QUENCH] - = 8 + sizeof(struct iphdr), - [ICMP_REDIRECT] - = 8 + sizeof(struct iphdr), - [ICMP_ECHO] = 4, - [ICMP_TIME_EXCEEDED] - = 8 + sizeof(struct iphdr), - [ICMP_PARAMETERPROB] - = 8 + sizeof(struct iphdr), - [ICMP_TIMESTAMP] = 20, - [ICMP_TIMESTAMPREPLY] = 20, - [ICMP_ADDRESS] = 12, - [ICMP_ADDRESSREPLY] = 12 }; - - /* Max length: 11 "PROTO=ICMP " */ - nf_log_buf_add(m, "PROTO=ICMP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, - sizeof(_icmph), &_icmph); - if (ich == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (ich->type <= NR_ICMP_TYPES && - required_len[ich->type] && - skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - switch (ich->type) { - case ICMP_ECHOREPLY: - case ICMP_ECHO: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - nf_log_buf_add(m, "ID=%u SEQ=%u ", - ntohs(ich->un.echo.id), - ntohs(ich->un.echo.sequence)); - break; - - case ICMP_PARAMETERPROB: - /* Max length: 14 "PARAMETER=255 " */ - nf_log_buf_add(m, "PARAMETER=%u ", - ntohl(ich->un.gateway) >> 24); - break; - case ICMP_REDIRECT: - /* Max length: 24 "GATEWAY=255.255.255.255 " */ - nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); - fallthrough; - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_TIME_EXCEEDED: - /* Max length: 
3+maxlen */ - if (!iphoff) { /* Only recurse once. */ - nf_log_buf_add(m, "["); - dump_ipv4_packet(net, m, info, skb, - iphoff + ih->ihl*4+sizeof(_icmph)); - nf_log_buf_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ich->type == ICMP_DEST_UNREACH && - ich->code == ICMP_FRAG_NEEDED) { - nf_log_buf_add(m, "MTU=%u ", - ntohs(ich->un.frag.mtu)); - } - } - break; - } - /* Max Length */ - case IPPROTO_AH: { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 9 "PROTO=AH " */ - nf_log_buf_add(m, "PROTO=AH "); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ah = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_ahdr), &_ahdr); - if (ah == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); - break; - } - case IPPROTO_ESP: { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 10 "PROTO=ESP " */ - nf_log_buf_add(m, "PROTO=ESP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - eh = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_esph), &_esph); - if (eh == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); - break; - } - /* Max length: 10 "PROTO 255 " */ - default: - nf_log_buf_add(m, "PROTO=%u ", ih->protocol); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(net, m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!iphoff && skb->mark) - nf_log_buf_add(m, "MARK=0x%x ", skb->mark); - - /* Proto Max log string length */ - /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ - /* UDP: 10+max(25,20) = 35 */ - /* UDPLITE: 14+max(25,20) = 39 */ - /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ - /* ESP: 10+max(25)+15 = 50 */ - /* AH: 9+max(25)+15 = 49 */ - /* unknown: 10 */ - - /* (ICMP allows recursion one level deep) */ - /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ -} - -static void dump_ipv4_mac_header(struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & NF_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); - nf_log_dump_vlan(m, skb); - nf_log_buf_add(m, "MACPROTO=%04x ", - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - nf_log_buf_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int i; - - nf_log_buf_add(m, "%02x", *p++); - for (i = 1; i < dev->hard_header_len; i++, p++) - nf_log_buf_add(m, ":%02x", *p); - } - nf_log_buf_add(m, " "); -} - -static void nf_log_ip_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct nf_log_buf *m; - - /* FIXME: Disabled from 
containers until syslog ns is supported */ - if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) - return; - - m = nf_log_buf_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - nf_log_dump_packet_common(m, pf, hooknum, skb, in, - out, loginfo, prefix); - - if (in != NULL) - dump_ipv4_mac_header(m, loginfo, skb); - - dump_ipv4_packet(net, m, loginfo, skb, 0); - - nf_log_buf_close(m); -} - -static struct nf_logger nf_ip_logger __read_mostly = { - .name = "nf_log_ipv4", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_ip_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_ipv4_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); -} - -static void __net_exit nf_log_ipv4_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_ip_logger); -} - -static struct pernet_operations nf_log_ipv4_net_ops = { - .init = nf_log_ipv4_net_init, - .exit = nf_log_ipv4_net_exit, -}; - -static int __init nf_log_ipv4_init(void) -{ - int ret; - - ret = register_pernet_subsys(&nf_log_ipv4_net_ops); - if (ret < 0) - return ret; - - ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); - if (ret < 0) { - pr_err("failed to register logger\n"); - goto err1; - } - - return 0; - -err1: - unregister_pernet_subsys(&nf_log_ipv4_net_ops); - return ret; -} - -static void __exit nf_log_ipv4_exit(void) -{ - unregister_pernet_subsys(&nf_log_ipv4_net_ops); - nf_log_unregister(&nf_ip_logger); -} - -module_init(nf_log_ipv4_init); -module_exit(nf_log_ipv4_exit); - -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(AF_INET, 0); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 743777bce179..4075230b14c6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -16,6 +16,9 @@ #include <net/route.h> #include <net/sock.h> +#define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) +#define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. 
*/ + static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); @@ -32,6 +35,7 @@ static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, + [NHA_RES_GROUP] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_policy_get[] = { @@ -45,6 +49,32 @@ static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_FDB] = { .type = NLA_FLAG }, }; +static const struct nla_policy rtm_nh_res_policy_new[] = { + [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, + [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, + [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_dump_bucket[] = { + [NHA_ID] = { .type = NLA_U32 }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_RES_BUCKET] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { + [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_get_bucket[] = { + [NHA_ID] = { .type = NLA_U32 }, + [NHA_RES_BUCKET] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { + [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, +}; + static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; @@ -52,10 +82,8 @@ static bool nexthop_notifiers_is_empty(struct net *net) static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, - const struct nexthop *nh) + const struct nh_info *nhi) { - struct nh_info *nhi = rtnl_dereference(nh->nh_info); - nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) @@ -71,12 +99,14 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; - __nh_notifier_single_info_init(info->nh, nh); + __nh_notifier_single_info_init(info->nh, nhi); return 0; } @@ -86,8 +116,8 @@ static void nh_notifier_single_info_fini(struct nh_notifier_info *info) kfree(info->nh); } -static int nh_notifier_mp_info_init(struct nh_notifier_info *info, - struct nh_group *nhg) +static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; @@ -103,11 +133,44 @@ static int nh_notifier_mp_info_init(struct nh_notifier_info *info, for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].id = nhge->nh->id; info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, - nhge->nh); + nhi); + } + + return 0; +} + +static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) +{ + struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); + u16 num_nh_buckets = res_table->num_nh_buckets; + unsigned long size; + u16 i; + + info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; + size = struct_size(info->nh_res_table, nhs, num_nh_buckets); + info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | + __GFP_NOWARN); + if 
(!info->nh_res_table) + return -ENOMEM; + + info->nh_res_table->num_nh_buckets = num_nh_buckets; + + for (i = 0; i < num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + struct nh_info *nhi; + + nhge = rtnl_dereference(bucket->nh_entry); + nhi = rtnl_dereference(nhge->nh->nh_info); + __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], + nhi); } return 0; @@ -118,8 +181,10 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) - return nh_notifier_mp_info_init(info, nhg); + if (nhg->hash_threshold) + return nh_notifier_mpath_info_init(info, nhg); + else if (nhg->resilient) + return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } @@ -128,8 +193,10 @@ static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) + if (nhg->hash_threshold) kfree(info->nh_grp); + else if (nhg->resilient) + vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, @@ -181,6 +248,178 @@ static int call_nexthop_notifiers(struct net *net, return notifier_to_errno(err); } +static int +nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, + bool force, unsigned int *p_idle_timer_ms) +{ + struct nh_res_table *res_table; + struct nh_group *nhg; + struct nexthop *nh; + int err = 0; + + /* When 'force' is false, nexthop bucket replacement is performed + * because the bucket was deemed to be idle. In this case, capable + * listeners can choose to perform an atomic replacement: The bucket is + * only replaced if it is inactive. However, if the idle timer interval + * is smaller than the interval in which a listener is querying + * buckets' activity from the device, then atomic replacement should + * not be tried. Pass the idle timer value to listeners, so that they + * could determine which type of replacement to perform. 
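A minimal sketch of how a listener might act on that hint, in the shape of a nexthop notifier callback; EXAMPLE_HW_POLL_MS stands for a hypothetical driver-private activity-poll interval and is not part of this patch:

	#include <linux/notifier.h>
	#include <net/nexthop.h>

	static int example_nh_notifier_cb(struct notifier_block *nb,
					  unsigned long event, void *ptr)
	{
		struct nh_notifier_info *info = ptr;

		if (event != NEXTHOP_EVENT_BUCKET_REPLACE)
			return NOTIFY_DONE;

		/* An "atomic" (only-if-idle) replacement cannot be trusted when
		 * the kernel's idle timer is shorter than the interval at which
		 * this driver polls bucket activity from the device, so one
		 * option is to veto the non-forced replacement and let upkeep
		 * retry later.
		 */
		if (!info->nh_res_bucket->force &&
		    info->nh_res_bucket->idle_timer_ms < EXAMPLE_HW_POLL_MS)
			return notifier_from_errno(-EBUSY);

		/* ... otherwise program info->nh_res_bucket->new_nh into the
		 * hardware bucket ...
		 */
		return NOTIFY_DONE;
	}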
+ */ + if (force) { + *p_idle_timer_ms = 0; + return 0; + } + + rcu_read_lock(); + + nh = nexthop_find_by_id(info->net, info->id); + if (!nh) { + err = -EINVAL; + goto out; + } + + nhg = rcu_dereference(nh->nh_grp); + res_table = rcu_dereference(nhg->res_table); + *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); + +out: + rcu_read_unlock(); + + return err; +} + +static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, + u16 bucket_index, bool force, + struct nh_info *oldi, + struct nh_info *newi) +{ + unsigned int idle_timer_ms; + int err; + + err = nh_notifier_res_bucket_idle_timer_get(info, force, + &idle_timer_ms); + if (err) + return err; + + info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; + info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), + GFP_KERNEL); + if (!info->nh_res_bucket) + return -ENOMEM; + + info->nh_res_bucket->bucket_index = bucket_index; + info->nh_res_bucket->idle_timer_ms = idle_timer_ms; + info->nh_res_bucket->force = force; + __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); + __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); + return 0; +} + +static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) +{ + kfree(info->nh_res_bucket); +} + +static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, + u16 bucket_index, bool force, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + .id = nhg_id, + }; + int err; + + if (nexthop_notifiers_is_empty(net)) + return 0; + + err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, + oldi, newi); + if (err) + return err; + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_BUCKET_REPLACE, &info); + nh_notifier_res_bucket_info_fini(&info); + + return notifier_to_errno(err); +} + +/* There are three users of RES_TABLE, and NHs etc. referenced from there: + * + * 1) a collection of callbacks for NH maintenance. This operates under + * RTNL, + * 2) the delayed work that gradually balances the resilient table, + * 3) and nexthop_select_path(), operating under RCU. + * + * Both the delayed work and the RTNL block are writers, and need to + * maintain mutual exclusion. Since there are only two and well-known + * writers for each table, the RTNL code can make sure it has exclusive + * access thus: + * + * - Have the DW operate without locking; + * - synchronously cancel the DW; + * - do the writing; + * - if the write was not actually a delete, call upkeep, which schedules + * DW again if necessary. + * + * The functions that are always called from the RTNL context use + * rtnl_dereference(). The functions that can also be called from the DW do + * a raw dereference and rely on the above mutual exclusion scheme. 
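Sketched as the RTNL writer's critical section (a fragment only; the helpers are the ones added further below in this patch):

	nh_res_table_cancel_upkeep(res_table);		/* synchronously stop the delayed work */
	/* ... rewrite buckets / group membership under RTNL ... */
	nh_res_table_upkeep(res_table, true, false);	/* reschedule the work unless this was a delete */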
+ */ +#define nh_res_dereference(p) (rcu_dereference_raw(p)) + +static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, + u16 bucket_index, bool force, + struct nexthop *old_nh, + struct nexthop *new_nh, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); + struct nh_info *newi = nh_res_dereference(new_nh->nh_info); + + return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, + force, oldi, newi, extack); +} + +static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; + struct nh_group *nhg; + int err; + + ASSERT_RTNL(); + + if (nexthop_notifiers_is_empty(net)) + return 0; + + /* At this point, the nexthop buckets are still not populated. Only + * emit a notification with the logical nexthops, so that a listener + * could potentially veto it in case of unsupported configuration. + */ + nhg = rtnl_dereference(nh->nh_grp); + err = nh_notifier_mpath_info_init(&info, nhg); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); + return err; + } + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, + &info); + kfree(info.nh_grp); + + return notifier_to_errno(err); +} + static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, @@ -239,6 +478,9 @@ static void nexthop_free_group(struct nexthop *nh) WARN_ON(nhg->spare == nhg); + if (nhg->resilient) + vfree(rcu_dereference_raw(nhg->res_table)); + kfree(nhg->spare); kfree(nhg); } @@ -297,6 +539,30 @@ static struct nh_group *nexthop_grp_alloc(u16 num_nh) return nhg; } +static void nh_res_table_upkeep_dw(struct work_struct *work); + +static struct nh_res_table * +nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) +{ + const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; + struct nh_res_table *res_table; + unsigned long size; + + size = struct_size(res_table, nh_buckets, num_nh_buckets); + res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); + if (!res_table) + return NULL; + + res_table->net = net; + res_table->nhg_id = nhg_id; + INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); + INIT_LIST_HEAD(&res_table->uw_nh_entries); + res_table->idle_timer = cfg->nh_grp_res_idle_timer; + res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; + res_table->num_nh_buckets = num_nh_buckets; + return res_table; +} + static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) @@ -345,6 +611,48 @@ static u32 nh_find_unused_id(struct net *net) return 0; } +static void nh_res_time_set_deadline(unsigned long next_time, + unsigned long *deadline) +{ + if (time_before(next_time, *deadline)) + *deadline = next_time; +} + +static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) +{ + if (list_empty(&res_table->uw_nh_entries)) + return 0; + return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); +} + +static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); + struct nlattr *nest; + + nest = nla_nest_start(skb, NHA_RES_GROUP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, + res_table->num_nh_buckets) || + nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, + 
jiffies_to_clock_t(res_table->idle_timer)) || + nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, + jiffies_to_clock_t(res_table->unbalanced_timer)) || + nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, + nh_res_table_unbalanced_time(res_table), + NHA_RES_GROUP_PAD)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) { struct nexthop_grp *p; @@ -353,8 +661,10 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) u16 group_type = 0; int i; - if (nhg->mpath) + if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; + else if (nhg->resilient) + group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; @@ -370,6 +680,9 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) p += 1; } + if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -457,13 +770,26 @@ nla_put_failure: return -EMSGSIZE; } +static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) +{ + return nla_total_size(0) + /* NHA_RES_GROUP */ + nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ + nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ + nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ + nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ +} + static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + size_t tot = nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ - return nla_total_size(sz) + - nla_total_size(2); /* NHA_GROUP_TYPE */ + if (nhg->resilient) + tot += nh_nlmsg_size_grp_res(nhg); + + return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) @@ -538,18 +864,142 @@ errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } +static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) +{ + return (unsigned long)atomic_long_read(&bucket->used_time); +} + +static unsigned long +nh_res_bucket_idle_point(const struct nh_res_table *res_table, + const struct nh_res_bucket *bucket, + unsigned long now) +{ + unsigned long time = nh_res_bucket_used_time(bucket); + + /* Bucket was not used since it was migrated. The idle time is now. 
*/ + if (time == bucket->migrated_time) + return now; + + return time + res_table->idle_timer; +} + +static unsigned long +nh_res_table_unb_point(const struct nh_res_table *res_table) +{ + return res_table->unbalanced_since + res_table->unbalanced_timer; +} + +static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, + struct nh_res_bucket *bucket) +{ + unsigned long now = jiffies; + + atomic_long_set(&bucket->used_time, (long)now); + bucket->migrated_time = now; +} + +static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) +{ + atomic_long_set(&bucket->used_time, (long)jiffies); +} + +static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) +{ + unsigned long used_time = nh_res_bucket_used_time(bucket); + + return jiffies_delta_to_clock_t(jiffies - used_time); +} + +static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, + struct nh_res_bucket *bucket, u16 bucket_index, + int event, u32 portid, u32 seq, + unsigned int nlflags, + struct netlink_ext_ack *extack) +{ + struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); + struct nlmsghdr *nlh; + struct nlattr *nest; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = bucket->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NHA_RES_BUCKET); + if (!nest) + goto nla_put_failure; + + if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || + nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || + nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, + nh_res_bucket_idle_time(bucket), + NHA_RES_BUCKET_PAD)) + goto nla_put_failure_nest; + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure_nest: + nla_nest_cancel(skb, nest); +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void nexthop_bucket_notify(struct nh_res_table *res_table, + u16 bucket_index) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); + struct nexthop *nh = nhge->nh_parent; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto errout; + + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, + NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); +} + static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - /* nested multipath (group within a group) is not - * supported - */ - if (nhg->mpath) { + /* Nesting groups within groups is not supported. 
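For reference, the RTM_NEWNEXTHOPBUCKET message assembled by nh_fill_res_bucket() above carries:

	RTM_NEWNEXTHOPBUCKET
	    struct nhmsg                          nh_family = AF_UNSPEC, nh_flags = bucket flags
	    NHA_ID                     (u32)      ID of the resilient nexthop group
	    NHA_RES_BUCKET             (nested)
	        NHA_RES_BUCKET_INDEX       (u16)  bucket index within the table
	        NHA_RES_BUCKET_NH_ID       (u32)  nexthop the bucket currently points at
	        NHA_RES_BUCKET_IDLE_TIME   (u64)  clock_t ticks since the bucket last saw traffic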
*/ + if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, - "Multipath group can not be a nexthop within a group"); + "Hash-threshold group can not be a nexthop within a group"); + return false; + } + if (nhg->resilient) { + NL_SET_ERR_MSG(extack, + "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; @@ -591,7 +1041,7 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, - struct netlink_ext_ack *extack) + u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; @@ -652,8 +1102,14 @@ static int nh_check_attr_group(struct net *net, for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; - if (i == NHA_FDB) + switch (i) { + case NHA_FDB: continue; + case NHA_RES_GROUP: + if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) + continue; + break; + } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -695,7 +1151,7 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; @@ -704,7 +1160,7 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->mpath.upper_bound)) + if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nhi = rcu_dereference(nhge->nh->nh_info); @@ -732,6 +1188,22 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) return rc; } +static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) +{ + struct nh_res_table *res_table = rcu_dereference(nhg->res_table); + u16 bucket_index = hash % res_table->num_nh_buckets; + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + /* nexthop_select_path() is expected to return a non-NULL value, so + * skip protocol validation and just hand out whatever there is. + */ + bucket = &res_table->nh_buckets[bucket_index]; + nh_res_bucket_set_busy(bucket); + nhge = rcu_dereference(bucket->nh_entry); + return nhge->nh; +} + struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; @@ -740,8 +1212,10 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) return nh; nhg = rcu_dereference(nh->nh_grp); - if (nhg->mpath) - return nexthop_select_path_mp(nhg, hash); + if (nhg->hash_threshold) + return nexthop_select_path_hthr(nhg, hash); + else if (nhg->resilient) + return nexthop_select_path_res(nhg, hash); /* Unreachable. 
*/ return NULL; @@ -924,7 +1398,319 @@ static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, return 0; } -static void nh_group_rebalance(struct nh_group *nhg) +static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets == nhge->res.wants_buckets; +} + +static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets > nhge->res.wants_buckets; +} + +static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets < nhge->res.wants_buckets; +} + +static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) +{ + return list_empty(&res_table->uw_nh_entries); +} + +static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) +{ + struct nh_grp_entry *nhge; + + if (bucket->occupied) { + nhge = nh_res_dereference(bucket->nh_entry); + nhge->res.count_buckets--; + bucket->occupied = false; + } +} + +static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, + struct nh_grp_entry *nhge) +{ + nh_res_bucket_unset_nh(bucket); + + bucket->occupied = true; + rcu_assign_pointer(bucket->nh_entry, nhge); + nhge->res.count_buckets++; +} + +static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, + struct nh_res_bucket *bucket, + unsigned long *deadline, bool *force) +{ + unsigned long now = jiffies; + struct nh_grp_entry *nhge; + unsigned long idle_point; + + if (!bucket->occupied) { + /* The bucket is not occupied, its NHGE pointer is either + * NULL or obsolete. We _have to_ migrate: set force. + */ + *force = true; + return true; + } + + nhge = nh_res_dereference(bucket->nh_entry); + + /* If the bucket is populated by an underweight or balanced + * nexthop, do not migrate. + */ + if (!nh_res_nhge_is_ow(nhge)) + return false; + + /* At this point we know that the bucket is populated with an + * overweight nexthop. It needs to be migrated to a new nexthop if + * the idle timer of unbalanced timer expired. + */ + + idle_point = nh_res_bucket_idle_point(res_table, bucket, now); + if (time_after_eq(now, idle_point)) { + /* The bucket is idle. We _can_ migrate: unset force. */ + *force = false; + return true; + } + + /* Unbalanced timer of 0 means "never force". */ + if (res_table->unbalanced_timer) { + unsigned long unb_point; + + unb_point = nh_res_table_unb_point(res_table); + if (time_after(now, unb_point)) { + /* The bucket is not idle, but the unbalanced timer + * expired. We _can_ migrate, but set force anyway, + * so that drivers know to ignore activity reports + * from the HW. + */ + *force = true; + return true; + } + + nh_res_time_set_deadline(unb_point, deadline); + } + + nh_res_time_set_deadline(idle_point, deadline); + return false; +} + +static bool nh_res_bucket_migrate(struct nh_res_table *res_table, + u16 bucket_index, bool notify, + bool notify_nl, bool force) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *new_nhge; + struct netlink_ext_ack extack; + int err; + + new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, + struct nh_grp_entry, + res.uw_nh_entry); + if (WARN_ON_ONCE(!new_nhge)) + /* If this function is called, "bucket" is either not + * occupied, or it belongs to a next hop that is + * overweight. In either case, there ought to be a + * corresponding underweight next hop. 
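A concrete walk through the checks above, assuming the default idle_timer of 120 s and an illustrative unbalanced_timer of 300 s: an unoccupied bucket is migrated immediately with force set; a bucket held by a balanced or underweight nexthop is left alone; a bucket held by an overweight nexthop is migrated without force once it has carried no traffic for 120 s (so a listener may still veto it); if it never goes idle, it is force-migrated once the group has been unbalanced for 300 s; and with the default unbalanced_timer of 0 the forced path never triggers, the bucket's idle point merely feeds the next upkeep deadline.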
+ */ + return false; + + if (notify) { + struct nh_grp_entry *old_nhge; + + old_nhge = nh_res_dereference(bucket->nh_entry); + err = call_nexthop_res_bucket_notifiers(res_table->net, + res_table->nhg_id, + bucket_index, force, + old_nhge->nh, + new_nhge->nh, &extack); + if (err) { + pr_err_ratelimited("%s\n", extack._msg); + if (!force) + return false; + /* It is not possible to veto a forced replacement, so + * just clear the hardware flags from the nexthop + * bucket to indicate to user space that this bucket is + * not correctly populated in hardware. + */ + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + } + } + + nh_res_bucket_set_nh(bucket, new_nhge); + nh_res_bucket_set_idle(res_table, bucket); + + if (notify_nl) + nexthop_bucket_notify(res_table, bucket_index); + + if (nh_res_nhge_is_balanced(new_nhge)) + list_del(&new_nhge->res.uw_nh_entry); + return true; +} + +#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) + +static void nh_res_table_upkeep(struct nh_res_table *res_table, + bool notify, bool notify_nl) +{ + unsigned long now = jiffies; + unsigned long deadline; + u16 i; + + /* Deadline is the next time that upkeep should be run. It is the + * earliest time at which one of the buckets might be migrated. + * Start at the most pessimistic estimate: either unbalanced_timer + * from now, or if there is none, idle_timer from now. For each + * encountered time point, call nh_res_time_set_deadline() to + * refine the estimate. + */ + if (res_table->unbalanced_timer) + deadline = now + res_table->unbalanced_timer; + else + deadline = now + res_table->idle_timer; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + bool force; + + if (nh_res_bucket_should_migrate(res_table, bucket, + &deadline, &force)) { + if (!nh_res_bucket_migrate(res_table, i, notify, + notify_nl, force)) { + unsigned long idle_point; + + /* A driver can override the migration + * decision if the HW reports that the + * bucket is actually not idle. Therefore + * remark the bucket as busy again and + * update the deadline. + */ + nh_res_bucket_set_busy(bucket); + idle_point = nh_res_bucket_idle_point(res_table, + bucket, + now); + nh_res_time_set_deadline(idle_point, &deadline); + } + } + } + + /* If the group is still unbalanced, schedule the next upkeep to + * either the deadline computed above, or the minimum deadline, + * whichever comes later. 
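Numerically, with NH_RES_UPKEEP_DW_MINIMUM_INTERVAL at HZ/2: if the earliest point at which any remaining bucket could become migratable is 90 s away, the delayed work is queued 90 s out; if the computed deadline is already due or less than half a second away (all candidate buckets are busy right now), it is clamped to half a second so upkeep does not spin rescanning the table.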
+ */ + if (!nh_res_table_is_balanced(res_table)) { + unsigned long now = jiffies; + unsigned long min_deadline; + + min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; + if (time_before(deadline, min_deadline)) + deadline = min_deadline; + + queue_delayed_work(system_power_efficient_wq, + &res_table->upkeep_dw, deadline - now); + } +} + +static void nh_res_table_upkeep_dw(struct work_struct *work) +{ + struct delayed_work *dw = to_delayed_work(work); + struct nh_res_table *res_table; + + res_table = container_of(dw, struct nh_res_table, upkeep_dw); + nh_res_table_upkeep(res_table, true, true); +} + +static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) +{ + cancel_delayed_work_sync(&res_table->upkeep_dw); +} + +static void nh_res_group_rebalance(struct nh_group *nhg, + struct nh_res_table *res_table) +{ + int prev_upper_bound = 0; + int total = 0; + int w = 0; + int i; + + INIT_LIST_HEAD(&res_table->uw_nh_entries); + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, + total); + nhge->res.wants_buckets = upper_bound - prev_upper_bound; + prev_upper_bound = upper_bound; + + if (nh_res_nhge_is_uw(nhge)) { + if (list_empty(&res_table->uw_nh_entries)) + res_table->unbalanced_since = jiffies; + list_add(&nhge->res.uw_nh_entry, + &res_table->uw_nh_entries); + } + } +} + +/* Migrate buckets in res_table so that they reference NHGE's from NHG with + * the right NH ID. Set those buckets that do not have a corresponding NHGE + * entry in NHG as not occupied. + */ +static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, + struct nh_group *nhg) +{ + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; + bool found = false; + int j; + + for (j = 0; j < nhg->num_nh; j++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[j]; + + if (nhge->nh->id == id) { + nh_res_bucket_set_nh(bucket, nhge); + found = true; + break; + } + } + + if (!found) + nh_res_bucket_unset_nh(bucket); + } +} + +static void replace_nexthop_grp_res(struct nh_group *oldg, + struct nh_group *newg) +{ + /* For NH group replacement, the new NHG might only have a stub + * hash table with 0 buckets, because the number of buckets was not + * specified. For NH removal, oldg and newg both reference the same + * res_table. So in any case, in the following, we want to work + * with oldg->res_table. 
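A worked example of the allocation above: a resilient group with 8 buckets and two nexthops of weights 1 and 3 gives total = 4, so the running upper bounds are DIV_ROUND_CLOSEST(8 * 1, 4) = 2 and DIV_ROUND_CLOSEST(8 * 4, 4) = 8, i.e. wants_buckets of 2 and 6; any entry currently holding fewer buckets than it wants is put on uw_nh_entries and upkeep migrates idle buckets towards it.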
+ */ + struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); + unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; + bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); + + nh_res_table_cancel_upkeep(old_res_table); + nh_res_table_migrate_buckets(old_res_table, newg); + nh_res_group_rebalance(newg, old_res_table); + if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) + old_res_table->unbalanced_since = prev_unbalanced_since; + nh_res_table_upkeep(old_res_table, true, false); +} + +static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; @@ -939,7 +1725,7 @@ static void nh_group_rebalance(struct nh_group *nhg) w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->mpath.upper_bound, upper_bound); + atomic_set(&nhge->hthr.upper_bound, upper_bound); } } @@ -965,7 +1751,9 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, } newg->has_v4 = false; - newg->mpath = nhg->mpath; + newg->is_multipath = nhg->is_multipath; + newg->hash_threshold = nhg->hash_threshold; + newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -993,15 +1781,25 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, j++; } - nh_group_rebalance(newg); + if (newg->hash_threshold) + nh_hthr_group_rebalance(newg); + else if (newg->resilient) + replace_nexthop_grp_res(nhg, newg); + rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); nexthop_put(nhge->nh); - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); - if (err) - pr_err("%s\n", extack._msg); + /* Removal of a NH from a resilient group is notified through + * bucket notifications. 
+ */ + if (newg->hash_threshold) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, + &extack); + if (err) + pr_err("%s\n", extack._msg); + } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); @@ -1022,6 +1820,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { @@ -1032,6 +1831,11 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) list_del_init(&nhge->nh_list); } + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + nh_res_table_cancel_upkeep(res_table); + } } /* not called for nexthop replace */ @@ -1107,9 +1911,12 @@ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) } static int replace_nexthop_grp(struct net *net, struct nexthop *old, - struct nexthop *new, + struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { + struct nh_res_table *tmp_table = NULL; + struct nh_res_table *new_res_table; + struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; @@ -1118,19 +1925,67 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return -EINVAL; } - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); - if (err) - return err; - oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); + if (newg->hash_threshold != oldg->hash_threshold) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); + return -EINVAL; + } + + if (newg->hash_threshold) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, + extack); + if (err) + return err; + } else if (newg->resilient) { + new_res_table = rtnl_dereference(newg->res_table); + old_res_table = rtnl_dereference(oldg->res_table); + + /* Accept if num_nh_buckets was not given, but if it was + * given, demand that the value be correct. + */ + if (cfg->nh_grp_res_has_num_buckets && + cfg->nh_grp_res_num_buckets != + old_res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); + return -EINVAL; + } + + /* Emit a pre-replace notification so that listeners could veto + * a potentially unsupported configuration. Otherwise, + * individual bucket replacement notifications would need to be + * vetoed, which is something that should only happen if the + * bucket is currently active. 
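The effect is that a resilient group's membership and its idle/unbalanced timers can be changed on replace, while the bucket count is fixed once the group exists: a replace that passes a different number of buckets is rejected, and one that omits it keeps the existing table (in user space, assuming matching iproute2 support, roughly `ip nexthop replace id 10 group 1/2/3 type resilient`, whereas the initial add has to pass `buckets`).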
+ */ + err = call_nexthop_res_table_notifiers(net, new, extack); + if (err) + return err; + + if (cfg->nh_grp_res_has_idle_timer) + old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; + if (cfg->nh_grp_res_has_unbalanced_timer) + old_res_table->unbalanced_timer = + cfg->nh_grp_res_unbalanced_timer; + + replace_nexthop_grp_res(oldg, newg); + + tmp_table = new_res_table; + rcu_assign_pointer(newg->res_table, old_res_table); + rcu_assign_pointer(newg->spare->res_table, old_res_table); + } + /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); + if (newg->resilient) { + rcu_assign_pointer(oldg->res_table, tmp_table); + rcu_assign_pointer(oldg->spare->res_table, tmp_table); + } + for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; @@ -1156,6 +2011,71 @@ static void nh_group_v4_update(struct nh_group *nhg) nhg->has_v4 = has_v4; } +static int replace_nexthop_single_notify_res(struct net *net, + struct nh_res_table *res_table, + struct nexthop *old, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + u32 nhg_id = res_table->nhg_id; + int err; + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + + nhge = rtnl_dereference(bucket->nh_entry); + if (nhge->nh == old) { + err = __call_nexthop_res_bucket_notifiers(net, nhg_id, + i, true, + oldi, newi, + extack); + if (err) + goto err_notify; + } + } + + return 0; + +err_notify: + while (i-- > 0) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + + nhge = rtnl_dereference(bucket->nh_entry); + if (nhge->nh == old) + __call_nexthop_res_bucket_notifiers(net, nhg_id, i, + true, newi, oldi, + extack); + } + return err; +} + +static int replace_nexthop_single_notify(struct net *net, + struct nexthop *group_nh, + struct nexthop *old, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->hash_threshold) { + return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, + group_nh, extack); + } else if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + return replace_nexthop_single_notify_res(net, res_table, + old, oldi, newi, + extack); + } + + return -EINVAL; +} + static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) @@ -1198,8 +2118,8 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, - extack); + err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, + extack); if (err) goto err_notify; } @@ -1229,7 +2149,7 @@ err_notify: list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; - call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack); + replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; @@ -1276,7 +2196,8 @@ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, } static int replace_nexthop(struct net *net, struct nexthop *old, - struct nexthop *new, struct netlink_ext_ack *extack) 
+ struct nexthop *new, const struct nh_config *cfg, + struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; @@ -1319,7 +2240,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old, } if (old->is_group) - err = replace_nexthop_grp(net, old, new, extack); + err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); @@ -1361,7 +2282,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { - rc = replace_nexthop(net, nh, new_nh, extack); + rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; @@ -1379,9 +2300,37 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, goto out; } + if (new_nh->is_group) { + struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + + /* Not passing the number of buckets is OK when + * replacing, but not when creating a new group. + */ + if (!cfg->nh_grp_res_has_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); + rc = -EINVAL; + goto out; + } + + nh_res_group_rebalance(nhg, res_table); + + /* Do not send bucket notifications, we do full + * notification below. + */ + nh_res_table_upkeep(res_table, false, false); + } + } + rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); + /* The initial insertion is a full notification for hash-threshold as + * well as resilient groups. + */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); @@ -1441,6 +2390,7 @@ static struct nexthop *nexthop_create_group(struct net *net, u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; + int err; int i; if (WARN_ON(!num_nh)) @@ -1472,8 +2422,10 @@ static struct nexthop *nexthop_create_group(struct net *net, struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); - if (!nexthop_get(nhe)) + if (!nexthop_get(nhe)) { + err = -ENOENT; goto out_no_nh; + } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) @@ -1485,13 +2437,28 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->nh_entries[i].nh_parent = nh; } - if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) - nhg->mpath = 1; + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->hash_threshold = 1; + nhg->is_multipath = true; + } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { + struct nh_res_table *res_table; + + res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); + if (!res_table) { + err = -ENOMEM; + goto out_no_nh; + } + + rcu_assign_pointer(nhg->spare->res_table, res_table); + rcu_assign_pointer(nhg->res_table, res_table); + nhg->resilient = true; + nhg->is_multipath = true; + } - WARN_ON_ONCE(nhg->mpath != 1); + WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); - if (nhg->mpath) - nh_group_rebalance(nhg); + if (nhg->hash_threshold) + nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; @@ -1510,7 +2477,7 @@ out_no_nh: kfree(nhg); kfree(nh); - return ERR_PTR(-ENOENT); + return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, @@ -1680,6 +2647,70 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, return nh; } +static 
int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, + unsigned long *timer_p, bool *has_p, + struct netlink_ext_ack *extack) +{ + unsigned long timer; + u32 value; + + if (!attr) { + *timer_p = fallback; + *has_p = false; + return 0; + } + + value = nla_get_u32(attr); + timer = clock_t_to_jiffies(value); + if (timer == ~0UL) { + NL_SET_ERR_MSG(extack, "Timer value too large"); + return -EINVAL; + } + + *timer_p = timer; + *has_p = true; + return 0; +} + +static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; + int err; + + if (res) { + err = nla_parse_nested(tb, + ARRAY_SIZE(rtm_nh_res_policy_new) - 1, + res, rtm_nh_res_policy_new, extack); + if (err < 0) + return err; + } + + if (tb[NHA_RES_GROUP_BUCKETS]) { + cfg->nh_grp_res_num_buckets = + nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); + cfg->nh_grp_res_has_num_buckets = true; + if (!cfg->nh_grp_res_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); + return -EINVAL; + } + } + + err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], + NH_RES_DEFAULT_IDLE_TIMER, + &cfg->nh_grp_res_idle_timer, + &cfg->nh_grp_res_has_idle_timer, + extack); + if (err) + return err; + + return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], + NH_RES_DEFAULT_UNBALANCED_TIMER, + &cfg->nh_grp_res_unbalanced_timer, + &cfg->nh_grp_res_has_unbalanced_timer, + extack); +} + static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) @@ -1758,7 +2789,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack); + err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), + cfg->nh_grp_type, extack); + if (err) + goto out; + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) + err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], + cfg, extack); /* no other attributes should be set */ goto out; @@ -1983,10 +3021,12 @@ errout_free: } struct nh_dump_filter { + u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; + u32 res_bucket_nh_id; }; static bool nh_dump_filtered(struct nexthop *nh, @@ -2100,26 +3140,24 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, void *data) { struct rb_node *node; - int idx = 0, s_idx; + int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; - if (idx < s_idx) - goto cont; - nh = rb_entry(node, struct nexthop, rb_node); - ctx->idx = idx; + if (nh->id < s_idx) + continue; + + ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; -cont: - idx++; } - ctx->idx = idx; + ctx->idx++; return 0; } @@ -2166,6 +3204,318 @@ out_err: return err; } +static struct nexthop * +nexthop_find_group_resilient(struct net *net, u32 id, + struct netlink_ext_ack *extack) +{ + struct nh_group *nhg; + struct nexthop *nh; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return ERR_PTR(-ENOENT); + + if (!nh->is_group) { + NL_SET_ERR_MSG(extack, "Not a nexthop group"); + return ERR_PTR(-EINVAL); + } + + nhg = rtnl_dereference(nh->nh_grp); + if (!nhg->resilient) { + NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); + return ERR_PTR(-EINVAL); + } + + return nh; +} + +static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, + struct netlink_ext_ack *extack) +{ + u32 idx; + + if (attr) { + 
idx = nla_get_u32(attr); + if (!idx) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + *nh_id_p = idx; + } else { + *nh_id_p = 0; + } + + return 0; +} + +static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, + struct nh_dump_filter *filter, + struct netlink_callback *cb) +{ + struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, + rtm_nh_policy_dump_bucket, NULL); + if (err < 0) + return err; + + err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); + if (err) + return err; + + if (tb[NHA_RES_BUCKET]) { + size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; + + err = nla_parse_nested(res_tb, max, + tb[NHA_RES_BUCKET], + rtm_nh_res_bucket_policy_dump, + cb->extack); + if (err < 0) + return err; + + err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], + &filter->res_bucket_nh_id, + cb->extack); + if (err) + return err; + } + + return __nh_valid_dump_req(nlh, tb, filter, cb->extack); +} + +struct rtm_dump_res_bucket_ctx { + struct rtm_dump_nh_ctx nh; + u16 bucket_index; + u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */ +}; + +static struct rtm_dump_res_bucket_ctx * +rtm_dump_res_bucket_ctx(struct netlink_callback *cb) +{ + struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + return ctx; +} + +struct rtm_dump_nexthop_bucket_data { + struct rtm_dump_res_bucket_ctx *ctx; + struct nh_dump_filter filter; +}; + +static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, + struct rtm_dump_nexthop_bucket_data *dd) +{ + u32 portid = NETLINK_CB(cb->skb).portid; + struct nhmsg *nhm = nlmsg_data(cb->nlh); + struct nh_res_table *res_table; + struct nh_group *nhg; + u16 bucket_index; + int err; + + if (dd->ctx->nh.idx < dd->ctx->done_nh_idx) + return 0; + + nhg = rtnl_dereference(nh->nh_grp); + res_table = rtnl_dereference(nhg->res_table); + for (bucket_index = dd->ctx->bucket_index; + bucket_index < res_table->num_nh_buckets; + bucket_index++) { + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + bucket = &res_table->nh_buckets[bucket_index]; + nhge = rtnl_dereference(bucket->nh_entry); + if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) + continue; + + if (dd->filter.res_bucket_nh_id && + dd->filter.res_bucket_nh_id != nhge->nh->id) + continue; + + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + } + + dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; + bucket_index = 0; + +out: + err = skb->len; +out_err: + dd->ctx->bucket_index = bucket_index; + return err; +} + +static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, void *data) +{ + struct rtm_dump_nexthop_bucket_data *dd = data; + struct nh_group *nhg; + + if (!nh->is_group) + return 0; + + nhg = rtnl_dereference(nh->nh_grp); + if (!nhg->resilient) + return 0; + + return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); +} + +/* rtnl */ +static int rtm_dump_nexthop_bucket(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); + struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; + struct net 
*net = sock_net(skb->sk); + struct nexthop *nh; + int err; + + err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); + if (err) + return err; + + if (dd.filter.nh_id) { + nh = nexthop_find_group_resilient(net, dd.filter.nh_id, + cb->extack); + if (IS_ERR(nh)) + return PTR_ERR(nh); + err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); + } else { + struct rb_root *root = &net->nexthop.rb_root; + + err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, + &rtm_dump_nexthop_bucket_cb, &dd); + } + + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + +out: + err = skb->len; +out_err: + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + return err; +} + +static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, + u16 *bucket_index, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, + res, rtm_nh_res_bucket_policy_get, extack); + if (err < 0) + return err; + + if (!tb[NHA_RES_BUCKET_INDEX]) { + NL_SET_ERR_MSG(extack, "Bucket index is missing"); + return -EINVAL; + } + + *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); + return 0; +} + +static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, + u32 *id, u16 *bucket_index, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, + rtm_nh_policy_get_bucket, extack); + if (err < 0) + return err; + + err = __nh_valid_get_del_req(nlh, tb, id, extack); + if (err) + return err; + + if (!tb[NHA_RES_BUCKET]) { + NL_SET_ERR_MSG(extack, "Bucket information is missing"); + return -EINVAL; + } + + err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], + bucket_index, extack); + if (err) + return err; + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct nh_res_table *res_table; + struct sk_buff *skb = NULL; + struct nh_group *nhg; + struct nexthop *nh; + u16 bucket_index; + int err; + u32 id; + + err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); + if (err) + return err; + + nh = nexthop_find_group_resilient(net, id, extack); + if (IS_ERR(nh)) + return PTR_ERR(nh); + + nhg = rtnl_dereference(nh->nh_grp); + res_table = rtnl_dereference(nhg->res_table); + if (bucket_index >= res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); + return -ENOENT; + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], + bucket_index, RTM_NEWNEXTHOPBUCKET, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0, extack); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + +errout_free: + kfree_skb(skb); + return err; +} + static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); @@ -2277,6 +3627,75 @@ out: } EXPORT_SYMBOL(nexthop_set_hw_flags); +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, + bool offload, bool trap) +{ + struct nh_res_table *res_table; + struct nh_res_bucket *bucket; + struct nexthop *nexthop; + struct nh_group *nhg; + + rcu_read_lock(); + + nexthop = 
nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + if (bucket_index >= nhg->res_table->num_nh_buckets) + goto out; + + res_table = rcu_dereference(nhg->res_table); + bucket = &res_table->nh_buckets[bucket_index]; + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + if (offload) + bucket->nh_flags |= RTNH_F_OFFLOAD; + if (trap) + bucket->nh_flags |= RTNH_F_TRAP; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); + +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, + unsigned long *activity) +{ + struct nh_res_table *res_table; + struct nexthop *nexthop; + struct nh_group *nhg; + u16 i; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + /* Instead of silently ignoring some buckets, demand that the sizes + * be the same. + */ + res_table = rcu_dereference(nhg->res_table); + if (num_buckets != res_table->num_nh_buckets) + goto out; + + for (i = 0; i < num_buckets; i++) { + if (test_bit(i, activity)) + nh_res_bucket_set_busy(&res_table->nh_buckets[i]); + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_res_grp_activity_update); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -2320,6 +3739,9 @@ static int __init nexthop_init(void) rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, + rtm_dump_nexthop_bucket, 0); + return 0; } subsys_initcall(nexthop_init); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8b943f85fff9..1c9f71a37258 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -453,7 +453,9 @@ EXPORT_SYMBOL_GPL(ping_bind); static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || - (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0); + (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || + (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || + (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bba150fdd265..f6787c55f6ab 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -21,7 +21,7 @@ * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table - * Alan Cox : MSS actually. Also added the window + * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. @@ -41,7 +41,7 @@ * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) - * Pavel Krauz : Limited broadcast fixed + * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. @@ -54,8 +54,8 @@ * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. 
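A minimal sketch of the driver side of nexthop_res_grp_activity_update() above; the group id, bucket count and which bucket is marked busy are made-up placeholders:

	#include <linux/bitmap.h>
	#include <linux/gfp.h>
	#include <net/nexthop.h>

	static void example_report_activity(struct net *net, u32 nhg_id,
					    u16 num_buckets)
	{
		unsigned long *activity;

		activity = bitmap_zalloc(num_buckets, GFP_KERNEL);
		if (!activity)
			return;

		/* One bit per bucket that passed traffic since the last poll;
		 * the core ignores the report unless num_buckets matches its
		 * own bucket count for the group.
		 */
		set_bit(3, activity);
		nexthop_res_grp_activity_update(net, nhg_id, num_buckets,
						activity);
		bitmap_free(activity);
	}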
- * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect - * Ilia Sotnikov : Removed TOS from hash calculations + * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect + * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt @@ -66,6 +66,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -234,19 +235,6 @@ static const struct seq_operations rt_cache_seq_ops = { .show = rt_cache_seq_show, }; -static int rt_cache_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rt_cache_seq_ops); -} - -static const struct proc_ops rt_cache_proc_ops = { - .proc_open = rt_cache_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - - static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; @@ -324,19 +312,6 @@ static const struct seq_operations rt_cpu_seq_ops = { .show = rt_cpu_seq_show, }; - -static int rt_cpu_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rt_cpu_seq_ops); -} - -static const struct proc_ops rt_cpu_proc_ops = { - .proc_open = rt_cpu_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { @@ -367,13 +342,13 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_create("rt_cache", 0444, net->proc_net, - &rt_cache_proc_ops); + pde = proc_create_seq("rt_cache", 0444, net->proc_net, + &rt_cache_seq_ops); if (!pde) goto err1; - pde = proc_create("rt_cache", 0444, - net->proc_net_stat, &rt_cpu_proc_ops); + pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, + &rt_cpu_seq_ops); if (!pde) goto err2; @@ -478,8 +453,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } -#define IP_IDENTS_SZ 2048u - +/* Hash tables of size 2048..262144 depending on RAM size. + * Each bucket uses 8 bytes. + */ +static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; @@ -489,12 +466,16 @@ static u32 *ip_tstamps __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = READ_ONCE(*p_tstamp); - u32 now = (u32)jiffies; + u32 bucket, old, now = (u32)jiffies; + atomic_t *p_id; + u32 *p_tstamp; u32 delta = 0; + bucket = hash & ip_idents_mask; + p_tstamp = ip_tstamps + bucket; + p_id = ip_idents + bucket; + old = READ_ONCE(*p_tstamp); + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); @@ -722,6 +703,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, for_each_possible_cpu(i) { struct rtable __rcu **prt; + prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) @@ -1258,12 +1240,12 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) } /* - We do not cache source address of outgoing interface, - because it is used only by IP RR, TS and SRR options, - so that it out of fast path. - - BTW remember: "addr" is allowed to be not aligned - in IP options! 
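Sizing check for the replacement above: each bucket is one atomic_t ident plus one u32 timestamp, 8 bytes in total, so the 2048..262144 bucket range works out to 16 KB..2 MB of memory, and ip_idents_reserve() now picks its bucket with hash & ip_idents_mask (the mask being one less than the power-of-two table size) rather than hash % 2048 against the old fixed-size arrays.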
+ * We do not cache source address of outgoing interface, + * because it is used only by IP RR, TS and SRR options, + * so that it out of fast path. + * + * BTW remember: "addr" is allowed to be not aligned + * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) @@ -2108,7 +2090,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto out; /* Check for the most weird martians, which can be not detected - by fib_lookup. + * by fib_lookup. */ tun_info = skb_tunnel_info(skb); @@ -2246,7 +2228,7 @@ local_input: if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; - rth->rt_flags &= ~RTCF_LOCAL; + rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { @@ -2317,15 +2299,15 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. - The problem was that too many Ethernet cards have broken/missing - hardware multicast filters :-( As result the host on multicasting - network acquires a lot of useless route cache entries, sort of - SDR messages from all the world. Now we try to get rid of them. - Really, provided software IP multicast filter is organized - reasonably (at least, hashed), it does not result in a slowdown - comparing with route cache reject entries. - Note, that multicast routers are not affected, because - route cache entry is created eventually. + * The problem was that too many Ethernet cards have broken/missing + * hardware multicast filters :-( As result the host on multicasting + * network acquires a lot of useless route cache entries, sort of + * SDR messages from all the world. Now we try to get rid of them. + * Really, provided software IP multicast filter is organized + * reasonably (at least, hashed), it does not result in a slowdown + * comparing with route cache reject entries. + * Note, that multicast routers are not affected, because + * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2537,11 +2519,11 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. - It was wrong for two reasons: - 1. ip_dev_find(net, saddr) can return wrong iface, if saddr - is assigned to multiple interfaces. - 2. Moreover, we are allowed to send packets with saddr - of another iface. --ANK + * It was wrong for two reasons: + * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr + * is assigned to multiple interfaces. + * 2. Moreover, we are allowed to send packets with saddr + * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && @@ -2553,18 +2535,18 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, goto out; /* Special hack: user can direct multicasts - and limited broadcast via necessary interface - without fiddling with IP_MULTICAST_IF or IP_PKTINFO. - This hack is not just for fun, it allows - vic,vat and friends to work. - They bind socket to loopback, set ttl to zero - and expect that it will work. - From the viewpoint of routing cache they are broken, - because we are not allowed to build multicast path - with loopback source addr (look, routing cache - cannot know, that ttl is zero, so that packet - will not leave this host and route is valid). - Luckily, this hack is good workaround. 
+ * and limited broadcast via necessary interface + * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. + * This hack is not just for fun, it allows + * vic,vat and friends to work. + * They bind socket to loopback, set ttl to zero + * and expect that it will work. + * From the viewpoint of routing cache they are broken, + * because we are not allowed to build multicast path + * with loopback source addr (look, routing cache + * cannot know, that ttl is zero, so that packet + * will not leave this host and route is valid). + * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; @@ -2627,21 +2609,21 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, (ipv4_is_multicast(fl4->daddr) || !netif_index_is_l3_master(net, fl4->flowi4_oif))) { /* Apparently, routing tables are wrong. Assume, - that the destination is on link. - - WHY? DW. - Because we are allowed to send to iface - even if it has NO routes and NO assigned - addresses. When oif is specified, routing - tables are looked up with only one purpose: - to catch if destination is gatewayed, rather than - direct. Moreover, if MSG_DONTROUTE is set, - we send packet, ignoring both routing tables - and ifaddr state. --ANK - - - We could make it even if oif is unknown, - likely IPv6, but we do not. + * that the destination is on link. + * + * WHY? DW. + * Because we are allowed to send to iface + * even if it has NO routes and NO assigned + * addresses. When oif is specified, routing + * tables are looked up with only one purpose: + * to catch if destination is gatewayed, rather than + * direct. Moreover, if MSG_DONTROUTE is set, + * we send packet, ignoring both routing tables + * and ifaddr state. --ANK + * + * + * We could make it even if oif is unknown, + * likely IPv6, but we do not. 
*/ if (fl4->saddr == 0) @@ -3553,18 +3535,25 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { + void *idents_hash; int cpu; - ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), - GFP_KERNEL); - if (!ip_idents) - panic("IP: failed to allocate ip_idents\n"); + /* For modern hosts, this will use 2 MB of memory */ + idents_hash = alloc_large_system_hash("IP idents", + sizeof(*ip_idents) + sizeof(*ip_tstamps), + 0, + 16, /* one bucket per 64 KB */ + HASH_ZERO, + NULL, + &ip_idents_mask, + 2048, + 256*1024); + + ip_idents = idents_hash; - prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); - ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) - panic("IP: failed to allocate ip_tstamps\n"); + ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 60465f077497..a62934b9f15a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -31,7 +31,6 @@ static int two = 2; static int four = 4; static int thousand = 1000; -static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -47,7 +46,6 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; -static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; @@ -209,7 +207,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); @@ -389,7 +387,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, { int ret = 0; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && !ret) { int enabled = init_net.ipv4.sysctl_tcp_early_demux; @@ -405,7 +403,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, { int ret = 0; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && !ret) { int enabled = init_net.ipv4.sysctl_udp_early_demux; @@ -457,7 +455,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ipv4.sysctl_fib_multipath_hash_policy); int ret; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); @@ -595,30 +593,39 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "icmp_echo_enable_probe", + .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe, + .maxlen = 
sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_ratelimit", @@ -645,9 +652,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "raw_l3mdev_accept", .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -655,60 +662,60 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_dynaddr", .data = &init_net.ipv4.sysctl_ip_dynaddr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_early_demux", .data = &init_net.ipv4.sysctl_ip_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_udp_early_demux }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_tcp_early_demux }, { .procname = "nexthop_compat_mode", .data = &init_net.ipv4.sysctl_nexthop_compat_mode, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, @@ -729,21 +736,21 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_use_pmtu", .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, 
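The conversions in this table all follow one mechanical pattern: the backing field shrinks from int to u8, .maxlen becomes sizeof(u8), the handler moves from the proc_dointvec*() family to proc_dou8vec_minmax(), and boolean-style knobs gain SYSCTL_ZERO/SYSCTL_ONE bounds. A hedged sketch of one such entry, with a made-up sysctl name and field (example_flag / sysctl_example_flag) purely for illustration:

/* Illustrative only: not a real sysctl; shows the shape of a u8-backed
 * entry after this conversion. The handler and .maxlen must change
 * together: keeping proc_dointvec() with an int-sized .maxlen on a u8
 * field would read and write 4 bytes and clobber neighbouring members
 * of struct netns_ipv4.
 */
static struct ctl_table example_table[] = {
	{
		.procname	= "example_flag",
		.data		= &init_net.ipv4.sysctl_example_flag,	/* declared as u8 */
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= SYSCTL_ZERO,	/* clamp writes to 0..1 */
		.extra2		= SYSCTL_ONE,
	},
	{ }
};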
}, { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, @@ -752,40 +759,40 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_autobind_reuse", .data = &init_net.ipv4.sysctl_ip_autobind_reuse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fwmark_accept", .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "tcp_l3mdev_accept", .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -793,9 +800,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_base_mss", @@ -840,9 +847,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", @@ -897,9 +904,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_keepalive_probes", .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_keepalive_intvl", @@ -911,26 +918,26 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_syn_retries", .data = &init_net.ipv4.sysctl_tcp_syn_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = &tcp_syn_retries_min, .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", .data = &init_net.ipv4.sysctl_tcp_synack_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_SYN_COOKIES { .procname = "tcp_syncookies", .data = &init_net.ipv4.sysctl_tcp_syncookies, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, #endif { @@ -943,24 +950,24 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_retries1", .data = &init_net.ipv4.sysctl_tcp_retries1, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - 
.proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_retr1_max }, { .procname = "tcp_retries2", .data = &init_net.ipv4.sysctl_tcp_retries2, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fin_timeout", @@ -979,9 +986,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_tw_reuse", .data = &init_net.ipv4.sysctl_tcp_tw_reuse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, @@ -1030,16 +1037,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, @@ -1057,9 +1064,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1067,88 +1074,88 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_sack", .data = &init_net.ipv4.sysctl_tcp_sack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_window_scaling", .data = &init_net.ipv4.sysctl_tcp_window_scaling, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_retrans", .data = &init_net.ipv4.sysctl_tcp_early_retrans, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &four, }, { .procname = "tcp_recovery", .data = &init_net.ipv4.sysctl_tcp_recovery, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_thin_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_slow_start_after_idle", .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_retrans_collapse", .data = 
&init_net.ipv4.sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_stdurg", .data = &init_net.ipv4.sysctl_tcp_stdurg, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rfc1337", .data = &init_net.ipv4.sysctl_tcp_rfc1337, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_abort_on_overflow", .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fack", .data = &init_net.ipv4.sysctl_tcp_fack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_max_reordering", @@ -1160,16 +1167,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_dsack", .data = &init_net.ipv4.sysctl_tcp_dsack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_app_win", .data = &init_net.ipv4.sysctl_tcp_app_win, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_adv_win_scale", @@ -1183,46 +1190,46 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_frto", .data = &init_net.ipv4.sysctl_tcp_frto, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_metrics_save", .data = &init_net.ipv4.sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_ssthresh_metrics_save", .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_workaround_signed_windows", .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_limit_output_bytes", @@ -1241,11 +1248,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_min_tso_segs", .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, - .extra2 = &gso_max_segs, }, { .procname = "tcp_min_rtt_wlen", @@ -1259,9 +1265,9 @@ static struct ctl_table ipv4_net_table[] 
= { { .procname = "tcp_autocorking", .data = &init_net.ipv4.sysctl_tcp_autocorking, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1323,18 +1329,17 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &comp_sack_nr_max, }, { .procname = "tcp_reflect_tos", .data = &init_net.ipv4.sysctl_tcp_reflect_tos, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1357,9 +1362,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de7cc8445ac0..e14fd0c50c10 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -267,6 +267,7 @@ #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> +#include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -2587,6 +2588,17 @@ void tcp_set_state(struct sock *sk, int state) BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + /* bpf uapi header bpf.h defines an anonymous enum with values + * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux + * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. + * But clang built vmlinux does not have this enum in DWARF + * since clang removes the above code before generating IR/debuginfo. + * Let us explicitly emit the type debuginfo to ensure the + * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF + * regardless of which compiler is used. + */ + BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index bc7d2a586e18..ad9d17923fc5 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,86 +10,6 @@ #include <net/inet_common.h> #include <net/tls.h> -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? 
copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. - */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) - break; - msg_rx = list_next_entry(msg_rx, list); - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); - } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -229,7 +149,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL static bool tcp_bpf_stream_read(const struct sock *sk) { struct sk_psock *psock; @@ -243,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -284,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -601,20 +499,43 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + /* TLS does not have an unhash proto in SW cases, + * but we need to ensure we stop using the sock_map + * unhash routine because the associated psock is being + * removed. So use the original unhash handler. 
+ */ + WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to @@ -629,4 +550,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ffcbe46dacdb..4a30deaa9a37 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -124,7 +124,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -static void bictcp_init(struct sock *sk) +static void cubictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -137,7 +137,7 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -319,7 +319,7 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) +static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -338,7 +338,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -static u32 bictcp_recalc_ssthresh(struct sock *sk) +static u32 cubictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -355,7 +355,7 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static void bictcp_state(struct sock *sk, u8 new_state) +static void cubictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -442,7 +442,7 @@ static void hystart_update(struct sock *sk, u32 delay) } } -static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) +static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -471,13 +471,13 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) } static struct tcp_congestion_ops cubictcp __read_mostly = { - .init = bictcp_init, - .ssthresh = bictcp_recalc_ssthresh, - .cong_avoid = bictcp_cong_avoid, - .set_state = bictcp_state, + .init = cubictcp_init, + .ssthresh = cubictcp_recalc_ssthresh, + .cong_avoid = cubictcp_cong_avoid, + .set_state = cubictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, - .cwnd_event = bictcp_cwnd_event, - .pkts_acked = bictcp_acked, + 
.cwnd_event = cubictcp_cwnd_event, + .pkts_acked = cubictcp_acked, .owner = THIS_MODULE, .name = "cubic", }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 69a545db80d2..4cf4dd532d1c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2914,7 +2914,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - WARN_ON(tp->retrans_out != 0); + WARN_ON(tp->retrans_out != 0 && !tp->syn_data); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -5994,11 +5994,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; else tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; - skb_rbtree_walk_from(data) { - if (__tcp_retransmit_skb(sk, data, 1)) - break; - } - tcp_rearm_rto(sk); + skb_rbtree_walk_from(data) + tcp_mark_skb_lost(sk, data); + tcp_xmit_retransmit_queue(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..312184cead57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -655,14 +655,18 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ +#ifdef CONFIG_TCP_MD5SIG +#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED +#else +#define OPTION_BYTES sizeof(__be32) +#endif + static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; -#ifdef CONFIG_TCP_MD5SIG - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; -#endif + __be32 opt[OPTION_BYTES / sizeof(__be32)]; } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG @@ -770,6 +774,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->daddr, &rep.th); } #endif + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ + if (rep.opt[0] == 0) { + __be32 mrst = mptcp_reset_option(skb); + + if (mrst) { + rep.opt[0] = mrst; + arg.iov[0].iov_len += sizeof(mrst); + rep.th.doff = arg.iov[0].iov_len / 4; + } + } + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); @@ -2806,6 +2821,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index e6459537d4d2..82b36ec3f2f8 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -63,7 +63,7 @@ enum tcp_lp_state { * @sowd: smoothed OWD << 3 * @owd_min: min OWD * @owd_max: max OWD - * @owd_max_rsv: resrved max owd + * @owd_max_rsv: reserved max owd * @remote_hz: estimated remote HZ * @remote_ref_time: remote reference time * @local_ref_time: local reference time @@ -305,7 +305,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) /* FIXME: try to reset owd_min and owd_max here * so decrease the chance the min/max is no longer suitable - * and will usually within threshold when whithin inference */ + * and will usually within threshold when within inference */ lp->owd_min = lp->sowd >> 3; lp->owd_max = lp->sowd >> 2; 
lp->owd_max_rsv = lp->sowd >> 2; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fbf140a770d8..bde781f46b41 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ -static bool skb_still_in_host_queue(const struct sock *sk, +static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - return true; + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); + if (skb_fclone_busy(sk, skb)) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } } return false; } @@ -3147,14 +3151,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: fragmentation, tunneling, mangling etc. - */ - if (refcount_read(&sk->sk_wmem_alloc) > - min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), - sk->sk_sndbuf)) - return -EAGAIN; - if (skb_still_in_host_queue(sk, skb)) return -EBUSY; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 99d743eb9dc4..15f5504adf5b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1782,6 +1782,35 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct sk_buff *skb; + int err, used; + + skb = skb_recv_udp(sk, 0, 1, &err); + if (!skb) + return err; + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } else if (used <= skb->len) { + copied += used; + } + + if (!desc->count) + break; + } + + return copied; +} +EXPORT_SYMBOL(udp_read_sock); + /* * This should be easy, if there is something there we * return it, otherwise we block. 
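udp_read_sock() above gives UDP the same generic read_sock()-style interface TCP already has: the caller passes a read_descriptor_t plus an sk_read_actor_t callback, the callback consumes each skb, and clearing desc->count stops the loop. A hedged kernel-style sketch of a consumer, with made-up names (my_read_ctx, my_recv_actor, my_read); it assumes a struct socket whose ops provide read_sock:

/* Illustrative only: copy datagram payloads into a flat buffer via the
 * read_sock()/recv_actor interface shown above.
 */
struct my_read_ctx {
	char	*buf;
	size_t	len;
	size_t	used;
};

static int my_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
			 unsigned int offset, size_t len)
{
	struct my_read_ctx *ctx = desc->arg.data;
	size_t copy = min(len, ctx->len - ctx->used);

	if (skb_copy_bits(skb, offset, ctx->buf + ctx->used, copy))
		return -EFAULT;

	ctx->used += copy;
	if (ctx->used == ctx->len)
		desc->count = 0;	/* tells udp_read_sock() to stop looping */
	return copy;
}

static int my_read(struct socket *sock, struct my_read_ctx *ctx)
{
	read_descriptor_t desc = {
		.arg.data	= ctx,
		.count		= 1,	/* keep going until the actor clears it */
	};

	return sock->ops->read_sock(sock->sk, &desc, my_recv_actor);
}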
@@ -2178,6 +2207,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); + + udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); @@ -2664,9 +2695,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_GRO: lock_sock(sk); + + /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk->sk_socket); up->gro_enabled = valbool; + up->accept_udp_l4 = valbool; release_sock(sk); break; @@ -2853,6 +2887,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..954c4591a6fd 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -4,6 +4,68 @@ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> +#include <net/inet_common.h> + +#include "udp_impl.h" + +static struct proto *udpv6_prot_saved __read_mostly; + +static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags, + addr_len); +#endif + return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); +} + +static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + if (sk_psock_queue_empty(psock)) { + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + if (err) { + ret = err; + goto out; + } + copied = -EAGAIN; + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} enum { UDP_BPF_IPV4, @@ -11,7 +73,6 @@ enum { UDP_BPF_NUM_PROTS, }; -static struct proto *udpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; @@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) *prot = *base; prot->unhash = sock_map_unhash; prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -41,12 +103,20 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? 
UDP_BPF_IPV4 : UDP_BPF_IPV6; + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index c5b4b586570f..54e06b88af69 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -515,21 +515,24 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; + /* we can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream + */ NAPI_GRO_CB(skb)->is_flist = 0; - if (skb->dev->features & NETIF_F_GRO_FRAGLIST) - NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; + if (!sk || !udp_sk(sk)->gro_receive) { + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1; - if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || - (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { - pp = call_gro_receive(udp_gro_receive_segment, head, skb); + if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || + (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) + pp = call_gro_receive(udp_gro_receive_segment, head, skb); return pp; } - if (!sk || NAPI_GRO_CB(skb)->encap_mark || + if (NAPI_GRO_CB(skb)->encap_mark || (uh->check && skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid) || - !udp_sk(sk)->gro_receive) + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the tunnel gro layer */ @@ -639,6 +642,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + /* clear the encap mark, so that inner frag_list gro_complete + * can take place + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + /* Set encapsulation before calling into inner gro_complete() * functions to make them set up the inner offsets. */ @@ -662,7 +670,8 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (NAPI_GRO_CB(skb)->is_flist) { + /* do fraglist only if there is no outer UDP encap (or we already processed it) */ + if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) { uh->len = htons(skb->len - nhoff); skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); |
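Together with the new psock_update_sk_prot hooks on tcp_prot and udp_prot, udp_bpf_update_proto() means a UDP socket placed in a BPF sockmap gets its sk_prot swapped to udp_bpf_prots, and restored again when the psock is torn down, so a plain recvmsg() can be served from the psock ingress queue. A hedged user-space sketch of the triggering side only: it assumes map_fd refers to an already created BPF_MAP_TYPE_SOCKMAP, the function name is made up, and whether the insertion is accepted still depends on the kernel's own suitability checks; inserting the socket is what causes the kernel to build a psock and invoke the update_proto callback.

#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <bpf/bpf.h>		/* libbpf: bpf_map_update_elem() */

/* Illustrative only: add a connected (and therefore hashed) UDP socket to
 * an existing sockmap slot. Returns the socket fd or -1 on error.
 */
static int add_udp_sock_to_sockmap(int map_fd, const char *ip, int port)
{
	struct sockaddr_in addr = { .sin_family = AF_INET };
	int key = 0, fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	addr.sin_port = htons(port);
	if (inet_pton(AF_INET, ip, &addr.sin_addr) != 1 ||
	    connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    bpf_map_update_elem(map_fd, &key, &fd, BPF_ANY) < 0) {
		close(fd);
		return -1;
	}

	return fd;
}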