Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c          |   1
-rw-r--r--  net/ipv4/cipso_ipv4.c       |   2
-rw-r--r--  net/ipv4/fou.c              |   4
-rw-r--r--  net/ipv4/icmp.c             |   6
-rw-r--r--  net/ipv4/inet_diag.c        |  17
-rw-r--r--  net/ipv4/inet_hashtables.c  |   6
-rw-r--r--  net/ipv4/ip_options.c       |  35
-rw-r--r--  net/ipv4/ip_output.c        |  11
-rw-r--r--  net/ipv4/ip_sockglue.c      |   5
-rw-r--r--  net/ipv4/ipmr.c             |  14
-rw-r--r--  net/ipv4/nexthop.c          |  66
-rw-r--r--  net/ipv4/ping.c             |  29
-rw-r--r--  net/ipv4/raw.c              |   5
-rw-r--r--  net/ipv4/route.c            |  19
-rw-r--r--  net/ipv4/syncookies.c       |   6
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  |   9
-rw-r--r--  net/ipv4/tcp.c              |  42
-rw-r--r--  net/ipv4/tcp_bpf.c          |  13
-rw-r--r--  net/ipv4/tcp_fastopen.c     |   2
-rw-r--r--  net/ipv4/tcp_input.c        | 151
-rw-r--r--  net/ipv4/tcp_ipv4.c         |  18
-rw-r--r--  net/ipv4/tcp_minisocks.c    |   1
-rw-r--r--  net/ipv4/tcp_output.c       | 194
-rw-r--r--  net/ipv4/tcp_scalable.c     |   2
-rw-r--r--  net/ipv4/udp.c              |   2
-rw-r--r--  net/ipv4/udp_bpf.c          |   9
26 files changed, 495 insertions(+), 174 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4307503a6f0b..b7260c8cef2e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1017,6 +1017,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon const struct proto_ops inet_stream_ops = { .family = PF_INET, + .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2eb71579f4d2..471d33a0d095 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -498,7 +498,7 @@ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value - * @audit_secid: the LSM secid to use in the audit message + * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index abd083415f89..5308cfa3de62 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -237,7 +237,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel - * header to the outer L3 tunnel header, or we are are simply + * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ @@ -429,7 +429,7 @@ next_proto: /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel - * header to the outer L3 tunnel header, or we are are simply + * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cf36f955bfe6..8f2e974a1e4d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -690,9 +690,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, rcu_read_unlock(); } - tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + tos = icmp_pointers[type].error ? 
(RT_TOS(iph->tos) | IPTOS_PREC_INTERNETCONTROL) : - iph->tos; + iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) @@ -784,7 +784,7 @@ EXPORT_SYMBOL(icmp_ndo_send); static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f1bd95f243b3..366a4507b5a3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); + struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); + memset(&inet_sockopt, 0, sizeof(inet_sockopt)); + inet_sockopt.recverr = inet->recverr; + inet_sockopt.is_icsk = inet->is_icsk; + inet_sockopt.freebind = inet->freebind; + inet_sockopt.hdrincl = inet->hdrincl; + inet_sockopt.mc_loop = inet->mc_loop; + inet_sockopt.transparent = inet->transparent; + inet_sockopt.mc_all = inet->mc_all; + inet_sockopt.nodefrag = inet->nodefrag; + inet_sockopt.bind_address_no_port = inet->bind_address_no_port; + inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; + inet_sockopt.defer_connect = inet->defer_connect; + if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), + &inet_sockopt)) + goto errout; + return 0; errout: return 1; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 239e54474b65..8cbe74313f38 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -228,7 +228,7 @@ static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, const int sdif, bool exact_dif) + const int dif, const int sdif) { int score = -1; @@ -277,15 +277,13 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - bool exact_dif = inet_exact_dif_match(net, skb); struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 948747aac4e2..da1b5038bdfd 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -47,32 +47,32 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); - memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) - memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (!is_frag) { if (opt->rr_needaddr) - ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, 
skb, rt); + ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) - ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); + ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); - memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } return; } if (opt->rr) { - memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]); opt->rr = 0; opt->rr_needaddr = 0; } if (opt->ts) { - memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]); opt->ts = 0; opt->ts_needaddr = opt->ts_needtime = 0; } @@ -495,26 +495,29 @@ EXPORT_SYMBOL(ip_options_compile); void ip_options_undo(struct ip_options *opt) { if (opt->srr) { - unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); - memmove(optptr+7, optptr+3, optptr[1]-7); - memcpy(optptr+3, &opt->faddr, 4); + unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); + + memmove(optptr + 7, optptr + 3, optptr[1] - 7); + memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { - unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); + unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); + optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); + memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { - unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); + unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); + if (opt->ts_needtime) { optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); - if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + memset(&optptr[optptr[2] - 1], 0, 4); + if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); + memset(&optptr[optptr[2] - 1], 0, 4); } } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e6f2ada9e7d5..8b200252c655 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -143,7 +143,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, + u8 tos) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); @@ -156,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = inet->tos; + iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; @@ -997,7 +998,7 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? 
IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1352,7 +1353,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (!(rt->dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features & NETIF_F_SG)) return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1537,7 +1538,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_select_ident(net, skb, sk); if (opt) { - iph->ihl += opt->optlen>>2; + iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, cork->addr, rt, 0); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d2c223554ff7..ec6036713e2c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1124,8 +1124,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, dev_put(dev); err = -EINVAL; - if (sk->sk_bound_dev_if && - (!midx || midx != sk->sk_bound_dev_if)) + if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; inet->uc_index = ifindex; @@ -1189,7 +1188,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = -EINVAL; if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && - (!midx || midx != sk->sk_bound_dev_if)) + midx != sk->sk_bound_dev_if) break; inet->mc_index = mreq.imr_ifindex; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 876fd6ff1ff9..939792a38814 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt, memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; - if (assert == IGMPMSG_WRVIFWHOLE) + if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; - else + msg->im_vif_hi = vifi >> 8; + } else { msg->im_vif = mrt->mroute_reg_vif_num; + msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8; + } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt, ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; + msg->im_vif_hi = vifi >> 8; skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); @@ -2396,6 +2400,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + + nla_total_size(4) /* IPMRA_CREPORT_TABLE */ /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; @@ -2427,11 +2432,12 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IPMR; if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || - nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) || + nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) || nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, - msg->im_dst.s_addr)) + msg->im_dst.s_addr) || + nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id)) goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 134e92382275..8c0f17c6863c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -42,8 +42,8 @@ static int call_nexthop_notifiers(struct 
net *net, { int err; - err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, - event_type, nh); + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); return notifier_to_errno(err); } @@ -133,12 +133,9 @@ static struct nexthop *nexthop_alloc(void) static struct nh_group *nexthop_grp_alloc(u16 num_nh) { - size_t sz = offsetof(struct nexthop, nh_grp) - + sizeof(struct nh_group) - + sizeof(struct nh_grp_entry) * num_nh; struct nh_group *nhg; - nhg = kzalloc(sz, GFP_KERNEL); + nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; @@ -279,7 +276,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && - nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; @@ -800,7 +797,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, return; } - newg->has_v4 = nhg->has_v4; + newg->has_v4 = false; newg->mpath = nhg->mpath; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -809,12 +806,18 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { + struct nh_info *nhi; + /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } + nhi = rtnl_dereference(nhges[i].nh->nh_info); + if (nhi->family == AF_INET) + newg->has_v4 = true; + list_del(&nhges[i].nh_list); new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; @@ -867,8 +870,6 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; - call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); - list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -906,6 +907,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh, static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -961,6 +964,23 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return 0; } +static void nh_group_v4_update(struct nh_group *nhg) +{ + struct nh_grp_entry *nhges; + bool has_v4 = false; + int i; + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; i++) { + struct nh_info *nhi; + + nhi = rtnl_dereference(nhges[i].nh->nh_info); + if (nhi->family == AF_INET) + has_v4 = true; + } + nhg->has_v4 = has_v4; +} + static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) @@ -984,6 +1004,21 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); + /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially + * update IPv4 indication in all the groups using the nexthop. 
+ */ + if (oldi->family == AF_INET && newi->family == AF_INET6) { + struct nh_grp_entry *nhge; + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + struct nh_group *nhg; + + nhg = rtnl_dereference(nhp->nh_grp); + nh_group_v4_update(nhg); + } + } + return 0; } @@ -1101,7 +1136,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, while (1) { struct nexthop *nh; - next = rtnl_dereference(*pp); + next = *pp; if (!next) break; @@ -1924,14 +1959,15 @@ static struct notifier_block nh_netdev_notifier = { int register_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); + return blocking_notifier_chain_register(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, - nb); + return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(unregister_nexthop_notifier); @@ -1951,7 +1987,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; - ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); + BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index df6fbefe44d4..248856b301c4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -293,7 +293,8 @@ EXPORT_SYMBOL_GPL(ping_close); /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, - struct sockaddr *uaddr, int addr_len) { + struct sockaddr *uaddr, int addr_len) +{ struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; @@ -310,10 +311,10 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; + else + chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || @@ -383,20 +384,6 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) } } -static void ping_clear_saddr(struct sock *sk, int dif) -{ - sk->sk_bound_dev_if = dif; - if (sk->sk_family == AF_INET) { - struct inet_sock *isk = inet_sk(sk); - isk->inet_rcv_saddr = isk->inet_saddr = 0; -#if IS_ENABLED(CONFIG_IPV6) - } else if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); - memset(&np->saddr, 0, sizeof(np->saddr)); -#endif - } -} /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. @@ -420,12 +407,13 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto out; err = -EADDRINUSE; - ping_set_saddr(sk, uaddr); snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { - ping_clear_saddr(sk, dif); + /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). 
*/ + sk->sk_bound_dev_if = dif; goto out; } + ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, @@ -647,7 +635,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, - void *user_icmph, size_t icmph_len) { + void *user_icmph, size_t icmph_len) +{ u8 type, code; if (len > 0xFFFF) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 407956be7deb..1170653a89cd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -260,11 +260,12 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; - err = icmp_err_convert[code].errno; - harderr = icmp_err_convert[code].fatal; if (code == ICMP_FRAG_NEEDED) { harderr = inet->pmtudisc != IP_PMTUDISC_DONT; err = EMSGSIZE; + } else { + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 58642b29a499..d15a78b26dfa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -623,7 +623,7 @@ static inline u32 fnhe_hashfun(__be32 daddr) u32 hval; net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); - hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); + hval = jhash_1word((__force u32)daddr, fnhe_hashrnd); return hash_32(hval, FNHE_HASH_SHIFT); } @@ -1016,13 +1016,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct net *net = dev_net(dst->dev); - u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; + u32 old_mtu; if (ip_mtu_locked(dst)) return; + old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; @@ -1066,7 +1067,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); @@ -1083,7 +1084,7 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1101,7 +1102,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; @@ -1131,7 +1132,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1156,7 +1157,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1172,7 +1173,7 @@ EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - const struct iphdr *iph = (const 
struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); @@ -1312,7 +1313,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) static unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *) dst; + const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; if (!mtu || time_after_eq(jiffies, rt->dst.expires)) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index f0794f0232ba..c375c126f436 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_request_sock *treq; struct request_sock *req; #ifdef CONFIG_MPTCP - struct tcp_request_sock *treq; - if (sk_is_mptcp(sk)) ops = &mptcp_subflow_request_sock_ops; #endif @@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, if (!req) return NULL; -#if IS_ENABLED(CONFIG_MPTCP) treq = tcp_rsk(req); + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; +#if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) { int err = mptcp_subflow_init_cookie_req(req, sk, skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 54023a46db04..3e5f4f2e705e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &comp_sack_nr_max, }, { + .procname = "tcp_reflect_tos", + .data = &init_net.ipv4.sysctl_tcp_reflect_tos, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 31f3b858db81..65057744fac8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -562,7 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { + if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -574,7 +576,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * pairs with the input side. 
*/ smp_mb__after_atomic(); - if (sk_stream_is_writeable(sk)) + if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else @@ -1002,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, tcp_rtx_and_write_queues_empty(sk)); if (!skb) - goto wait_for_memory; + goto wait_for_space; #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); @@ -1026,7 +1028,7 @@ new_segment: goto new_segment; } if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); @@ -1067,9 +1069,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1280,7 +1281,7 @@ restart: new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; @@ -1291,7 +1292,7 @@ new_segment: skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) - goto wait_for_memory; + goto wait_for_space; process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; @@ -1324,7 +1325,7 @@ new_segment: struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) - goto wait_for_memory; + goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { @@ -1338,7 +1339,7 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, @@ -1391,9 +1392,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1525,7 +1525,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. 
*/ -static void tcp_cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -2685,6 +2685,8 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; @@ -3207,7 +3209,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_SAVE_SYN: - if (val < 0 || val > 1) + /* 0: disable, 1: enable, 2: start from ether_header */ + if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; @@ -3788,20 +3791,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); if (tp->saved_syn) { - if (len < tp->saved_syn[0]) { - if (put_user(tp->saved_syn[0], optlen)) { + if (len < tcp_saved_syn_len(tp->saved_syn)) { + if (put_user(tcp_saved_syn_len(tp->saved_syn), + optlen)) { release_sock(sk); return -EFAULT; } release_sock(sk); return -EINVAL; } - len = tp->saved_syn[0]; + len = tcp_saved_syn_len(tp->saved_syn); if (put_user(len, optlen)) { release_sock(sk); return -EFAULT; } - if (copy_to_user(optval, tp->saved_syn + 1, len)) { + if (copy_to_user(optval, tp->saved_syn->data, len)) { release_sock(sk); return -EFAULT; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 7aa68f4aae6c..37f4cb2bba5c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -567,10 +567,9 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; } -static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); @@ -603,13 +602,11 @@ struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - if (!psock->sk_proto) { - struct proto *ops = READ_ONCE(sk->sk_prot); - - if (tcp_bpf_assert_proto_ops(ops)) + if (sk->sk_family == AF_INET6) { + if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return ERR_PTR(-EINVAL); - tcp_bpf_check_v6_needs_rebuild(sk, ops); + tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } return &tcp_bpf_prots[family][config]; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 09b62de04eea..af2814c9342a 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -295,7 +295,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. 
*/ - tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 184ea556f50e..50834e7f958e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -138,6 +138,69 @@ void clean_acked_data_flush(void) EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif +#ifdef CONFIG_CGROUP_BPF +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + struct bpf_sock_ops_kern sock_ops; + + if (likely(!unknown_opt && !parse_all_opt)) + return; + + /* The skb will be handled in the + * bpf_skops_established() or + * bpf_skops_write_hdr_opt(). + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_LISTEN: + return; + } + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ + struct bpf_sock_ops_kern sock_ops; + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = bpf_op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} +#else +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ +} +#endif + static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, unsigned int len) { @@ -3801,7 +3864,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } -static void smc_parse_options(const struct tcphdr *th, +static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) @@ -3810,10 +3873,13 @@ static void smc_parse_options(const struct tcphdr *th, if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { opt_rx->smc_ok = 1; + return true; + } } #endif + return false; } /* Try to parse the MSS option from the TCP header. 
Return 0 on failure, clamped @@ -3874,6 +3940,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->saw_unknown = 0; while (length > 0) { int opcode = *ptr++; @@ -3964,15 +4031,21 @@ void tcp_parse_options(const struct net *net, */ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && get_unaligned_be16(ptr) == - TCPOPT_FASTOPEN_MAGIC) + TCPOPT_FASTOPEN_MAGIC) { tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); - else - smc_parse_options(th, opt_rx, ptr, - opsize); + break; + } + + if (smc_parse_options(th, opt_rx, ptr, opsize)) + break; + + opt_rx->saw_unknown = 1; break; + default: + opt_rx->saw_unknown = 1; } ptr += opsize-2; length -= opsize; @@ -5259,12 +5332,6 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return true; } -/* When incoming ACK allowed to free some skb from write_queue, - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket - * on the exit from tcp input handler. - * - * PROBLEM: sndbuf expansion does not work well with largesend. - */ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5279,16 +5346,13 @@ static void tcp_new_space(struct sock *sk) static void tcp_check_space(struct sock *sk) { - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + /* pairs with tcp_poll() */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -5590,6 +5654,8 @@ syn_challenge: goto discard; } + bpf_skops_parse_hdr(sk, skb); + return true; discard: @@ -5798,7 +5864,7 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); -void tcp_init_transfer(struct sock *sk, int bpf_op) +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5819,7 +5885,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; - tcp_call_bpf(sk, bpf_op, 0, NULL); + bpf_skops_established(sk, bpf_op, skb); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -5838,7 +5904,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); } - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -6310,7 +6376,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, + skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); @@ -6599,13 +6666,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk, { if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); - u32 *copy; + struct saved_syn *saved_syn; + u32 mac_hdrlen; + void *base; + + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. 
*/ + base = skb_mac_header(skb); + mac_hdrlen = skb_mac_header_len(skb); + len += mac_hdrlen; + } else { + base = skb_network_header(skb); + mac_hdrlen = 0; + } - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); - if (copy) { - copy[0] = len; - memcpy(©[1], skb_network_header(skb), len); - req->saved_syn = copy; + saved_syn = kmalloc(struct_size(saved_syn, data, len), + GFP_ATOMIC); + if (saved_syn) { + saved_syn->mac_hdrlen = mac_hdrlen; + saved_syn->network_hdrlen = skb_network_header_len(skb); + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); + memcpy(saved_syn->data, base, len); + req->saved_syn = saved_syn; } } } @@ -6744,6 +6825,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { @@ -6752,7 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, TCP_SYNACK_FASTOPEN); + &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); @@ -6770,7 +6852,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_timeout_init((struct sock *)req)); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : - TCP_SYNACK_COOKIE); + TCP_SYNACK_COOKIE, + skb); if (want_cookie) { reqsk_free(req); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5084333b5ab6..ace48b2790ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -575,7 +575,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is - * is already accepted it is treated as a connected one below. + * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; @@ -965,18 +965,23 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; + u8 tos; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); + + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -984,7 +989,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + rcu_dereference(ireq->ireq_opt), + tos & ~INET_ECN_MASK); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1529,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); + /* Set ToS of the new socket based upon the value of incoming SYN. 
*/ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..56c306e3cd2f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,6 +548,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + bpf_skops_init_child(sk, newsk); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 85ff417bda7f..386978dcd318 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -438,6 +438,7 @@ struct tcp_out_options { u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ + u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ @@ -452,6 +453,145 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) #endif } +#ifdef CONFIG_CGROUP_BPF +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, + enum tcp_synack_type synack_type) +{ + if (unlikely(!skb)) + return BPF_WRITE_HDR_TCP_CURRENT_MSS; + + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; + + return 0; +} + +/* req, syn_skb and synack_type are used when writing synack */ +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || + !*remaining) + return; + + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ + + /* init sock_ops */ + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; + + if (req) { + /* The listen "sk" cannot be passed here because + * it is not locked. It would not make too much + * sense to do bpf_setsockopt(listen_sk) based + * on individual connection request also. + * + * Thus, "req" is passed here and the cgroup-bpf-progs + * of the listen "sk" will be run. + * + * "req" is also used here for fastopen even the "sk" here is + * a fullsock "child" sk. It is to keep the behavior + * consistent between fastopen and non-fastopen on + * the bpf programming side. 
+ */ + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = *remaining; + /* tcp_current_mss() does not pass a skb */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, 0); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err || sock_ops.remaining_opt_len == *remaining) + return; + + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; + /* round up to 4 bytes */ + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; + + *remaining -= opts->bpf_opt_len; +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!max_opt_len)) + return; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; + + if (req) { + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = max_opt_len; + first_opt_off = tcp_hdrlen(skb) - max_opt_len; + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err) + nr_written = 0; + else + nr_written = max_opt_len - sock_ops.remaining_opt_len; + + if (nr_written < max_opt_len) + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, + max_opt_len - nr_written); +} +#else +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ +} +#endif + /* Write previously computed TCP options to the packet. 
* * Beware: Something in the Internet is very sensitive to the ordering of @@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -701,7 +843,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -758,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, + synack_type, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -826,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + + size = MAX_TCP_OPTION_SPACE - remaining; + } + return size; } @@ -1213,6 +1368,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif + /* BPF prog is the last one writing header option */ + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); @@ -1524,7 +1682,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -3336,20 +3493,20 @@ int tcp_send_synack(struct sock *sk) } /** - * tcp_make_synack - Prepare a SYN-ACK. - * sk: listener socket - * dst: dst entry attached to the SYNACK - * req: request_sock pointer - * foc: cookie for tcp fast open - * synack_type: Type of synback to prepare - * - * Allocate one skb and build a SYNACK packet. - * @dst is consumed : Caller should not use it again. + * tcp_make_synack - Allocate one skb and build a SYNACK packet. + * @sk: listener socket + * @dst: dst entry attached to the SYNACK. It is consumed and caller + * should not use it again. + * @req: request_sock pointer + * @foc: cookie for tcp fast open + * @synack_type: Type of synack to prepare + * @syn_skb: SYN packet just received. It could be NULL for rtx case. 
*/ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -3408,8 +3565,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + /* bpf program will be interested in the tcp_flags */ + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc, synack_type) + sizeof(*th); + foc, synack_type, + syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3441,6 +3601,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, + synack_type, &opts); + skb->skb_mstamp_ns = now; tcp_add_tx_delay(skb, tp); @@ -3741,6 +3904,8 @@ void tcp_send_delayed_ack(struct sock *sk) ato = min(ato, max_ato); } + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); + /* Stay within the limit we were given */ timeout = jiffies + ato; @@ -3934,7 +4099,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + NULL); if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6cebf412d590..5842081bc8a2 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -10,7 +10,7 @@ #include <net/tcp.h> /* These factors derived from the recommended values in the aer: - * .01 and and 7/8. + * .01 and 7/8. */ #define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e88efba07551..09f0a23d1a01 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1170,7 +1170,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { /* oif is set, packet is to local broadcast and - * and uc_index is set. oif is most likely set + * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. 
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index eddd973e6575..7a94791efc1a 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -22,10 +22,9 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) prot->close = sock_map_close; } -static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { spin_lock_bh(&udpv6_prot_lock); if (likely(ops != udpv6_prot_saved)) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); @@ -46,8 +45,8 @@ struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; - if (!psock->sk_proto) - udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot)); + if (sk->sk_family == AF_INET6) + udp_bpf_check_v6_needs_rebuild(psock->sk_proto); return &udp_bpf_prots[family]; } |