diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 2 | ||||
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 3 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 2 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 2 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 47 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 68 | ||||
-rw-r--r-- | net/ipv4/netfilter/arp_tables.c | 14 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 14 | ||||
-rw-r--r-- | net/ipv4/route.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 475 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 42 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 9 | ||||
-rw-r--r-- | net/ipv4/udp.c | 2 |
15 files changed, 528 insertions, 176 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b7260c8cef2e..b94fa8eb831b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); if (err) return err; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764..d520e61649c8 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id) @@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, size_t end; if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b87140a1fa28..cdf6ec5aa45d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - goto errout; + return -EINVAL; } return 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4148f5f78f31..f60869acbef0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -787,7 +787,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); 
mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL); + inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 10d31733297d..05cd198d7a6b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -145,12 +145,16 @@ static void inet_frags_free_cb(void *ptr, void *arg) inet_frag_destroy(fq); } -static void fqdir_work_fn(struct work_struct *work) +static LLIST_HEAD(fqdir_free_list); + +static void fqdir_free_fn(struct work_struct *work) { - struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); - struct inet_frags *f = fqdir->f; + struct llist_node *kill_list; + struct fqdir *fqdir, *tmp; + struct inet_frags *f; - rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + /* Atomically snapshot the list of fqdirs to free */ + kill_list = llist_del_all(&fqdir_free_list); /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) * have completed, since they need to dereference fqdir. 
@@ -158,10 +162,25 @@ static void fqdir_work_fn(struct work_struct *work) */ rcu_barrier(); - if (refcount_dec_and_test(&f->refcnt)) - complete(&f->completion); + llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { + f = fqdir->f; + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); - kfree(fqdir); + kfree(fqdir); + } +} + +static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); + +static void fqdir_work_fn(struct work_struct *work) +{ + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + if (llist_add(&fqdir->free_list, &fqdir_free_list)) + queue_work(system_wq, &fqdir_free_work); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) @@ -184,10 +203,22 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) } EXPORT_SYMBOL(fqdir_init); +static struct workqueue_struct *inet_frag_wq; + +static int __init inet_frag_wq_init(void) +{ + inet_frag_wq = create_workqueue("inet_frag_wq"); + if (!inet_frag_wq) + panic("Could not create inet frag workq"); + return 0; +} + +pure_initcall(inet_frag_wq_init); + void fqdir_exit(struct fqdir *fqdir) { INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); - queue_work(system_wq, &fqdir->destroy_work); + queue_work(inet_frag_wq, &fqdir->destroy_work); } EXPORT_SYMBOL(fqdir_exit); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8cbe74313f38..45fb450b4522 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,6 +20,9 @@ #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/inet6_hashtables.h> +#endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> @@ -508,10 +511,52 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -/* insert a socket into ehash, and eventually remove another one - * 
(The another one can be a SYN_RECV or TIMEWAIT +/* Searches for an existing socket in the ehash bucket list. + * Returns true if found, false otherwise. */ -bool inet_ehash_insert(struct sock *sk, struct sock *osk) +static bool inet_ehash_lookup_by_sk(struct sock *sk, + struct hlist_nulls_head *list) +{ + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); + const int sdif = sk->sk_bound_dev_if; + const int dif = sk->sk_bound_dev_if; + const struct hlist_nulls_node *node; + struct net *net = sock_net(sk); + struct sock *esk; + + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); + + sk_nulls_for_each_rcu(esk, node, list) { + if (esk->sk_hash != sk->sk_hash) + continue; + if (sk->sk_family == AF_INET) { + if (unlikely(INET_MATCH(esk, net, acookie, + sk->sk_daddr, + sk->sk_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (unlikely(INET6_MATCH(esk, net, + &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#endif + } + return false; +} + +/* Insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT) + * If an existing socket already exists, socket sk is not inserted, + * and sets found_dup_sk parameter to true. 
+ */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; @@ -530,16 +575,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk) if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); + if (*found_dup_sk) + ret = false; } + if (ret) __sk_nulls_add_node_rcu(sk, list); + spin_unlock(lock); + return ret; } -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - bool ok = inet_ehash_insert(sk, osk); + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -583,7 +635,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { - inet_ehash_nolisten(sk, osk); + inet_ehash_nolisten(sk, osk, NULL); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -679,7 +731,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); spin_unlock_bh(&head->lock); return 0; } @@ -758,7 +810,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1e04d2b5170..563b62b76a5f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = 
READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; void *loc_cpu_entry; @@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1330,7 +1330,7 @@ static int 
compat_copy_entries_to_user(unsigned int total_size, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f15bc21d7301..6e2851f8d3a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1018,7 +1018,7 @@ get_entries(struct net *net, 
struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c962f0d96d8d..e26652ff7059 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3225,7 +3225,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; @@ -3249,8 +3249,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; - err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, - dev, &res); + err = ip_route_input_rcu(skb, dst, src, + rtm->rtm_tos & IPTOS_RT_MASK, dev, + &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b285b338a019..ed42d2193c5c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1758,52 +1758,272 @@ int tcp_mmap(struct file *file, struct socket *sock, } EXPORT_SYMBOL(tcp_mmap); +static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, + u32 *offset_frag) +{ + skb_frag_t *frag; + + offset_skb -= skb_headlen(skb); + if ((int)offset_skb < 0 || skb_has_frag_list(skb)) + return NULL; + + frag = skb_shinfo(skb)->frags; + while (offset_skb) { + if (skb_frag_size(frag) > offset_skb) { + *offset_frag = offset_skb; + return frag; + } + offset_skb -= skb_frag_size(frag); + ++frag; + } + *offset_frag = 0; + return frag; +} + +static bool can_map_frag(const skb_frag_t *frag) +{ + return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag); +} + +static int find_next_mappable_frag(const skb_frag_t *frag, + int remaining_in_skb) +{ + int offset = 0; + + if (likely(can_map_frag(frag))) + return 0; + + while (offset < remaining_in_skb && !can_map_frag(frag)) { + offset += skb_frag_size(frag); + ++frag; + } + return offset; +} + +static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct sk_buff *skb, u32 offset) +{ + u32 frag_offset, partial_frag_remainder = 0; + int mappable_offset; + skb_frag_t *frag; + + /* worst case: skip to next skb. 
try to improve on this case below */ + zc->recv_skip_hint = skb->len - offset; + + /* Find the frag containing this offset (and how far into that frag) */ + frag = skb_advance_to_frag(skb, offset, &frag_offset); + if (!frag) + return; + + if (frag_offset) { + struct skb_shared_info *info = skb_shinfo(skb); + + /* We read part of the last frag, must recvmsg() rest of skb. */ + if (frag == &info->frags[info->nr_frags - 1]) + return; + + /* Else, we must at least read the remainder in this frag. */ + partial_frag_remainder = skb_frag_size(frag) - frag_offset; + zc->recv_skip_hint -= partial_frag_remainder; + ++frag; + } + + /* partial_frag_remainder: If part way through a frag, must read rest. + * mappable_offset: Bytes till next mappable frag, *not* counting bytes + * in partial_frag_remainder. + */ + mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); + zc->recv_skip_hint = mappable_offset + partial_frag_remainder; +} + +static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags); +static int receive_fallback_to_copy(struct sock *sk, + struct tcp_zerocopy_receive *zc, int inq) +{ + unsigned long copy_address = (unsigned long)zc->copybuf_address; + struct scm_timestamping_internal tss_unused; + int err, cmsg_flags_unused; + struct msghdr msg = {}; + struct iovec iov; + + zc->length = 0; + zc->recv_skip_hint = 0; + + if (copy_address != zc->copybuf_address) + return -EINVAL; + + err = import_single_range(READ, (void __user *)copy_address, + inq, &iov, &msg.msg_iter); + if (err) + return err; + + err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, + &tss_unused, &cmsg_flags_unused); + if (err < 0) + return err; + + zc->copybuf_len = err; + if (likely(zc->copybuf_len)) { + struct sk_buff *skb; + u32 offset; + + skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); + if (skb) + tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); + 
} + return 0; +} + +static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, + struct sk_buff *skb, u32 copylen, + u32 *offset, u32 *seq) +{ + unsigned long copy_address = (unsigned long)zc->copybuf_address; + struct msghdr msg = {}; + struct iovec iov; + int err; + + if (copy_address != zc->copybuf_address) + return -EINVAL; + + err = import_single_range(READ, (void __user *)copy_address, + copylen, &iov, &msg.msg_iter); + if (err) + return err; + err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); + if (err) + return err; + zc->recv_skip_hint -= copylen; + *offset += copylen; + *seq += copylen; + return (__s32)copylen; +} + +static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc, + struct sock *sk, + struct sk_buff *skb, + u32 *seq, + s32 copybuf_len) +{ + u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); + + if (!copylen) + return 0; + /* skb is null if inq < PAGE_SIZE. */ + if (skb) + offset = *seq - TCP_SKB_CB(skb)->seq; + else + skb = tcp_recv_skb(sk, *seq, &offset); + + zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, + seq); + return zc->copybuf_len < 0 ? 0 : copylen; +} + +static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, + struct page **pending_pages, + unsigned long pages_remaining, + unsigned long *address, + u32 *length, + u32 *seq, + struct tcp_zerocopy_receive *zc, + u32 total_bytes_to_map, + int err) +{ + /* At least one page did not map. Try zapping if we skipped earlier. */ + if (err == -EBUSY && + zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { + u32 maybe_zap_len; + + maybe_zap_len = total_bytes_to_map - /* All bytes to map */ + *length + /* Mapped or pending */ + (pages_remaining * PAGE_SIZE); /* Failed map. */ + zap_page_range(vma, *address, maybe_zap_len); + err = 0; + } + + if (!err) { + unsigned long leftover_pages = pages_remaining; + int bytes_mapped; + + /* We called zap_page_range, try to reinsert. 
*/ + err = vm_insert_pages(vma, *address, + pending_pages, + &pages_remaining); + bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); + *seq += bytes_mapped; + *address += bytes_mapped; + } + if (err) { + /* Either we were unable to zap, OR we zapped, retried an + * insert, and still had an issue. Either way, pages_remaining + * is the number of pages we were unable to map, and we unroll + * some state we speculatively touched before. + */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + + *length -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return err; +} + static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, - unsigned long pages_to_map, - unsigned long *insert_addr, - u32 *length_with_pending, + unsigned int pages_to_map, + unsigned long *address, + u32 *length, u32 *seq, - struct tcp_zerocopy_receive *zc) + struct tcp_zerocopy_receive *zc, + u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; - int bytes_mapped; - int ret; + unsigned int pages_mapped; + unsigned int bytes_mapped; + int err; - ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); - bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); + err = vm_insert_pages(vma, *address, pages, &pages_remaining); + pages_mapped = pages_to_map - (unsigned int)pages_remaining; + bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; - *insert_addr += bytes_mapped; - if (ret) { - /* But if vm_insert_pages did fail, we have to unroll some state - * we speculatively touched before. 
- */ - const int bytes_not_mapped = PAGE_SIZE * pages_remaining; - *length_with_pending -= bytes_not_mapped; - zc->recv_skip_hint += bytes_not_mapped; - } - return ret; + *address += bytes_mapped; + + if (likely(!err)) + return 0; + + /* Error: maybe zap and retry + rollback state for failed inserts. */ + return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, + pages_remaining, address, length, seq, zc, total_bytes_to_map, + err); } +#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { + u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; - u32 length = 0, seq, offset, zap_len; - #define PAGE_BATCH_SIZE 8 - struct page *pages[PAGE_BATCH_SIZE]; + struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; + s32 copybuf_len = zc->copybuf_len; + struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; + unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; - unsigned long pg_idx = 0; - unsigned long curr_addr; - struct tcp_sock *tp; - int inq; + u32 seq = tp->copied_seq; + u32 total_bytes_to_map; + int inq = tcp_inq(sk); int ret; + zc->copybuf_len = 0; + if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; @@ -1812,7 +2032,16 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); - tp = tcp_sk(sk); + if (inq && inq <= copybuf_len) + return receive_fallback_to_copy(sk, zc, inq); + + if (inq < PAGE_SIZE) { + zc->length = 0; + zc->recv_skip_hint = inq; + if (!inq && sock_flag(sk, SOCK_DONE)) + return -EIO; + return 0; + } mmap_read_lock(current->mm); @@ -1821,33 +2050,26 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_unlock(current->mm); return -EINVAL; } - zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); - - seq = tp->copied_seq; - inq = tcp_inq(sk); - zc->length = min_t(u32, zc->length, inq); - zap_len = zc->length & 
~(PAGE_SIZE - 1); - if (zap_len) { - zap_page_range(vma, address, zap_len); + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); + avail_len = min_t(u32, vma_len, inq); + total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); + if (total_bytes_to_map) { + if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) + zap_page_range(vma, address, total_bytes_to_map); + zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = zc->length; + zc->length = avail_len; + zc->recv_skip_hint = avail_len; } ret = 0; - curr_addr = address; while (length + PAGE_SIZE <= zc->length) { + int mappable_offset; + struct page *page; + if (zc->recv_skip_hint < PAGE_SIZE) { - /* If we're here, finish the current batch. */ - if (pg_idx) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, - pg_idx, - &curr_addr, - &length, - &seq, zc); - if (ret) - goto out; - pg_idx = 0; - } + u32 offset_frag; + if (skb) { if (zc->recv_skip_hint > 0) break; @@ -1857,56 +2079,57 @@ static int tcp_zerocopy_receive(struct sock *sk, skb = tcp_recv_skb(sk, seq, &offset); } zc->recv_skip_hint = skb->len - offset; - offset -= skb_headlen(skb); - if ((int)offset < 0 || skb_has_frag_list(skb)) + frags = skb_advance_to_frag(skb, offset, &offset_frag); + if (!frags || offset_frag) break; - frags = skb_shinfo(skb)->frags; - while (offset) { - if (skb_frag_size(frags) > offset) - goto out; - offset -= skb_frag_size(frags); - frags++; - } } - if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { - int remaining = zc->recv_skip_hint; - while (remaining && (skb_frag_size(frags) != PAGE_SIZE || - skb_frag_off(frags))) { - remaining -= skb_frag_size(frags); - frags++; - } - zc->recv_skip_hint -= remaining; + mappable_offset = find_next_mappable_frag(frags, + zc->recv_skip_hint); + if (mappable_offset) { + zc->recv_skip_hint = mappable_offset; break; } - pages[pg_idx] = skb_frag_page(frags); - pg_idx++; + page = skb_frag_page(frags); + prefetchw(page); + 
pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; - if (pg_idx == PAGE_BATCH_SIZE) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, - &curr_addr, &length, - &seq, zc); + if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || + zc->recv_skip_hint < PAGE_SIZE) { + /* Either full batch, or we're about to go to next skb + * (and we cannot unroll failed ops across skbs). + */ + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pages_to_map, + &address, &length, + &seq, zc, + total_bytes_to_map); if (ret) goto out; - pg_idx = 0; + pages_to_map = 0; } } - if (pg_idx) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, - &curr_addr, &length, &seq, - zc); + if (pages_to_map) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, + &address, &length, &seq, + zc, total_bytes_to_map); } out: mmap_read_unlock(current->mm); - if (length) { + /* Try to copy straggler data. */ + if (!ret) + copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq, + copybuf_len); + + if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ tcp_recv_skb(sk, seq, &offset); - tcp_cleanup_rbuf(sk, length); + tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; @@ -2028,36 +2251,28 @@ static int tcp_inq_hint(struct sock *sk) * Probably, code can be easily improved even more. 
*/ -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, - int flags, int *addr_len) +static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; - int err, inq; + int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; - struct scm_timestamping_internal tss; - int cmsg_flags; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && - (sk->sk_state == TCP_ESTABLISHED)) - sk_busy_loop(sk, nonblock); - - lock_sock(sk); err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; - cmsg_flags = tp->recvmsg_inq ? 1 : 0; + if (tp->recvmsg_inq) + *cmsg_flags = 1; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2237,8 +2452,8 @@ skip_copy: } if (TCP_SKB_CB(skb)->has_rxtstamp) { - tcp_update_recv_tstamps(skb, &tss); - cmsg_flags |= 2; + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= 2; } if (used + offset < skb->len) @@ -2264,22 +2479,9 @@ found_fin_ok: /* Clean up data we have read: This will do ACK frames. 
*/ tcp_cleanup_rbuf(sk, copied); - - release_sock(sk); - - if (cmsg_flags) { - if (cmsg_flags & 2) - tcp_recv_timestamp(msg, sk, &tss); - if (cmsg_flags & 1) { - inq = tcp_inq_hint(sk); - put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); - } - } - return copied; out: - release_sock(sk); return err; recv_urg: @@ -2290,6 +2492,36 @@ recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } + +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) +{ + int cmsg_flags = 0, ret, inq; + struct scm_timestamping_internal tss; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (sk_can_busy_loop(sk) && + skb_queue_empty_lockless(&sk->sk_receive_queue) && + sk->sk_state == TCP_ESTABLISHED) + sk_busy_loop(sk, nonblock); + + lock_sock(sk); + ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss, + &cmsg_flags); + release_sock(sk); + + if (cmsg_flags && ret >= 0) { + if (cmsg_flags & 2) + tcp_recv_timestamp(msg, sk, &tss); + if (cmsg_flags & 1) { + inq = tcp_inq_hint(sk); + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } + } + return ret; +} EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) @@ -3042,6 +3274,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepcnt); +int tcp_set_window_clamp(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!val) { + if (sk->sk_state != TCP_CLOSE) + return -EINVAL; + tp->window_clamp = 0; + } else { + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? + SOCK_MIN_RCVBUF / 2 : val; + } + return 0; +} + /* * Socket option code for TCP. */ @@ -3255,15 +3502,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_WINDOW_CLAMP: - if (!val) { - if (sk->sk_state != TCP_CLOSE) { - err = -EINVAL; - break; - } - tp->window_clamp = 0; - } else - tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? 
- SOCK_MIN_RCVBUF / 2 : val; + err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: @@ -3843,7 +4082,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { - struct tcp_zerocopy_receive zc; + struct tcp_zerocopy_receive zc = {}; int err; if (get_user(len, optlen)) @@ -3860,7 +4099,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc); release_sock(sk); - if (len == sizeof(zc)) + if (len >= offsetofend(struct tcp_zerocopy_receive, err)) goto zerocopy_rcv_sk_err; switch (len) { case offsetofend(struct tcp_zerocopy_receive, err): diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index db47ac24d057..563d016e7478 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -198,6 +198,11 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb3a7750f623..d6ad3b5c38e7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -510,7 +510,6 @@ static void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -534,6 +533,8 @@ static void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; + tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, + (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. 
*/ @@ -6799,18 +6800,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) + dst = af_ops->route_req(sk, skb, &fl, req); + if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ if (!net->ipv4.sysctl_tcp_syncookies && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c2d5132c523c..58207c7769d0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -980,17 +980,23 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); - tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; - if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | + (inet_sk(sk)->tos & INET_ECN_MASK) : + inet_sk(sk)->tos; + + if (!INET_ECN_is_capable(tos) && + tcp_bpf_ca_needs_ecn((struct sock *)req)) + tos |= INET_ECN_ECT_0; + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), - tos & ~INET_ECN_MASK); + tos); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1439,9 +1445,15 @@ static void tcp_v4_init_req(struct request_sock *req, } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v4_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet_csk_route_req(sk, &fl->u.ip4, req); } @@ -1461,7 +1473,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif - .init_req = tcp_v4_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif @@ -1498,6 +1509,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bool *own_req) { struct inet_request_sock *ireq; + bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; @@ -1535,7 +1547,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); - /* Set ToS of the new socket based upon the value of incoming SYN. */ + /* Set ToS of the new socket based upon the value of incoming SYN. + * ECT bits are set later in tcp_init_transfer(). 
+ */ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; @@ -1575,12 +1589,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { - newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } else { + newinet->inet_opt = NULL; + } } return newsk; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 41880d3521ed..f322e798a351 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1882,7 +1882,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * window, and remember whether we were cwnd-limited then. */ if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { + tp->packets_out > tp->max_packets_out || + is_cwnd_limited) { tp->max_packets_out = tp->packets_out; tp->max_packets_seq = tp->snd_nxt; tp->is_cwnd_limited = is_cwnd_limited; @@ -2706,6 +2707,10 @@ repair: else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2713,8 +2718,6 @@ repair: /* Send one loss probe per tail loss episode. 
*/ if (push_one != 2) tcp_schedule_loss_probe(sk, false); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a3f105227ccc..dece195f212c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2175,7 +2175,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) - ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } |