summaryrefslogtreecommitdiff
path: root/net/core/sock.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/sock.c')
-rw-r--r--net/core/sock.c204
1 files changed, 122 insertions, 82 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index 782343bb925b..043db3ce023e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,7 +7,6 @@
* Generic socket support routines. Memory allocators, socket lock/release
* handler for protocols to use and generic option handler.
*
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
@@ -81,12 +81,6 @@
* Arnaldo C. Melo : cleanups, use skb_queue_purge
*
* To Fix:
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -137,6 +131,7 @@
#include <linux/filter.h>
#include <net/sock_reuseport.h>
+#include <net/bpf_sk_storage.h>
#include <trace/events/sock.h>
@@ -338,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
struct __kernel_sock_timeval tv;
- int size;
if (timeo == MAX_SCHEDULE_TIMEOUT) {
tv.tv_sec = 0;
@@ -348,7 +342,7 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
}
- if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
*(struct old_timeval32 *)optval = tv32;
return sizeof(tv32);
@@ -359,20 +353,18 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
old_tv.tv_sec = tv.tv_sec;
old_tv.tv_usec = tv.tv_usec;
*(struct __kernel_old_timeval *)optval = old_tv;
- size = sizeof(old_tv);
- } else {
- *(struct __kernel_sock_timeval *)optval = tv;
- size = sizeof(tv);
+ return sizeof(old_tv);
}
- return size;
+ *(struct __kernel_sock_timeval *)optval = tv;
+ return sizeof(tv);
}
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
{
struct __kernel_sock_timeval tv;
- if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
struct old_timeval32 tv32;
if (optlen < sizeof(tv32))
@@ -526,8 +518,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
rc = sk_backlog_rcv(sk, skb);
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+ } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
bh_unlock_sock(sk);
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
@@ -692,7 +684,8 @@ out:
return ret;
}
-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
+ int valbool)
{
if (valbool)
sock_set_flag(sk, bit);
@@ -790,7 +783,8 @@ set_sndbuf:
*/
val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -836,7 +830,8 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ max_t(int, val * 2, SOCK_MIN_RCVBUF));
break;
case SO_RCVBUFFORCE:
@@ -979,7 +974,7 @@ set_rcvbuf:
if (sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
break;
case SO_RCVTIMEO_OLD:
@@ -1044,6 +1039,10 @@ set_rcvbuf:
}
break;
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -1126,7 +1125,7 @@ set_rcvbuf:
break;
}
case SO_INCOMING_CPU:
- sk->sk_incoming_cpu = val;
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
break;
case SO_CNX_ADVICE:
@@ -1475,16 +1474,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_INCOMING_CPU:
- v.val = sk->sk_incoming_cpu;
+ v.val = READ_ONCE(sk->sk_incoming_cpu);
break;
case SO_MEMINFO:
{
u32 meminfo[SK_MEMINFO_VARS];
- if (get_user(len, optlen))
- return -EFAULT;
-
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
@@ -1601,7 +1597,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
- if (priority & __GFP_ZERO)
+ if (want_init_on_alloc(priority))
sk_prot_clear_nulls(sk, prot->obj_size);
} else
sk = kmalloc(prot->obj_size, priority);
@@ -1704,11 +1700,13 @@ static void __sk_destruct(struct rcu_head *head)
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- if (rcu_access_pointer(sk->sk_reuseport_cb))
- reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_sk_storage_free(sk);
+#endif
+
if (atomic_read(&sk->sk_omem_alloc))
pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc));
@@ -1728,7 +1726,14 @@ static void __sk_destruct(struct rcu_head *head)
void sk_destruct(struct sock *sk)
{
- if (sock_flag(sk, SOCK_RCU_FREE))
+ bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
+
+ if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+ reuseport_detach_sock(sk);
+ use_call_rcu = true;
+ }
+
+ if (use_call_rcu)
call_rcu(&sk->sk_rcu, __sk_destruct);
else
__sk_destruct(&sk->sk_rcu);
@@ -1852,6 +1857,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
}
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
newsk->sk_priority = 0;
@@ -1989,6 +2000,19 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL(skb_set_owner_w);
+static bool can_skb_orphan_partial(const struct sk_buff *skb)
+{
+#ifdef CONFIG_TLS_DEVICE
+ /* Drivers depend on in-order delivery for crypto offload,
+ * partial orphan breaks out-of-order-OK logic.
+ */
+ if (skb->decrypted)
+ return false;
+#endif
+ return (skb->destructor == sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
+}
+
/* This helper is used by netem, as it can hold packets in its
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
@@ -2000,11 +2024,7 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (skb->destructor == sock_wfree
-#ifdef CONFIG_INET
- || skb->destructor == tcp_wfree
-#endif
- ) {
+ if (can_skb_orphan_partial(skb)) {
struct sock *sk = skb->sk;
if (refcount_inc_not_zero(&sk->sk_refcnt)) {
@@ -2068,8 +2088,10 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force ||
+ refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
struct sk_buff *skb = alloc_skb(size, priority);
+
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -2170,7 +2192,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2205,7 +2227,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
break;
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -2314,13 +2336,14 @@ static void sk_leave_memory_pressure(struct sock *sk)
} else {
unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
- if (memory_pressure && *memory_pressure)
- *memory_pressure = 0;
+ if (memory_pressure && READ_ONCE(*memory_pressure))
+ WRITE_ONCE(*memory_pressure, 0);
}
}
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
+DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
* skb_page_frag_refill - check that a page_frag contains enough room
@@ -2345,7 +2368,8 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
}
pfrag->offset = 0;
- if (SKB_FRAG_PAGE_ORDER) {
+ if (SKB_FRAG_PAGE_ORDER &&
+ !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP | __GFP_NOWARN |
@@ -2784,7 +2808,7 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
@@ -2842,7 +2866,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
if (sock) {
sk->sk_type = sock->type;
- RCU_INIT_POINTER(sk->sk_wq, sock->wq);
+ RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
@@ -2977,41 +3001,46 @@ bool lock_sock_fast(struct sock *sk)
}
EXPORT_SYMBOL(lock_sock_fast);
-int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+ bool timeval, bool time32)
{
- struct timeval tv;
+ struct sock *sk = sock->sk;
+ struct timespec64 ts;
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- tv = ktime_to_timeval(sock_read_timestamp(sk));
- if (tv.tv_sec == -1)
+ ts = ktime_to_timespec64(sock_read_timestamp(sk));
+ if (ts.tv_sec == -1)
return -ENOENT;
- if (tv.tv_sec == 0) {
+ if (ts.tv_sec == 0) {
ktime_t kt = ktime_get_real();
sock_write_timestamp(sk, kt);
- tv = ktime_to_timeval(kt);
+ ts = ktime_to_timespec64(kt);
}
- return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
-}
-EXPORT_SYMBOL(sock_get_timestamp);
-int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
-{
- struct timespec ts;
+ if (timeval)
+ ts.tv_nsec /= 1000;
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- ts = ktime_to_timespec(sock_read_timestamp(sk));
- if (ts.tv_sec == -1)
- return -ENOENT;
- if (ts.tv_sec == 0) {
- ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);
- ts = ktime_to_timespec(sk->sk_stamp);
+#ifdef CONFIG_COMPAT_32BIT_TIME
+ if (time32)
+ return put_old_timespec32(&ts, userstamp);
+#endif
+#ifdef CONFIG_SPARC64
+ /* beware of padding in sparc64 timeval */
+ if (timeval && !in_compat_syscall()) {
+ struct __kernel_old_timeval __user tv = {
+ .tv_sec = ts.tv_sec,
+ .tv_usec = ts.tv_nsec,
+ };
+ if (copy_to_user(userstamp, &tv, sizeof(tv)))
+ return -EFAULT;
+ return 0;
}
- return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+#endif
+ return put_timespec64(&ts, userstamp);
}
-EXPORT_SYMBOL(sock_get_timestampns);
+EXPORT_SYMBOL(sock_gettstamp);
-void sock_enable_timestamp(struct sock *sk, int flag)
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
if (!sock_flag(sk, flag)) {
unsigned long previous_flags = sk->sk_flags;
@@ -3177,13 +3206,13 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
- mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
- mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
- mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
- mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
@@ -3268,16 +3297,17 @@ static __init int net_inuse_init(void)
core_initcall(net_inuse_init);
-static void assign_proto_idx(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
pr_err("PROTO_INUSE_NR exhausted\n");
- return;
+ return -ENOSPC;
}
set_bit(prot->inuse_idx, proto_inuse_idx);
+ return 0;
}
static void release_proto_idx(struct proto *prot)
@@ -3286,8 +3316,9 @@ static void release_proto_idx(struct proto *prot)
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
-static inline void assign_proto_idx(struct proto *prot)
+static inline int assign_proto_idx(struct proto *prot)
{
+ return 0;
}
static inline void release_proto_idx(struct proto *prot)
@@ -3336,6 +3367,8 @@ static int req_prot_init(const struct proto *prot)
int proto_register(struct proto *prot, int alloc_slab)
{
+ int ret = -ENOBUFS;
+
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
@@ -3372,20 +3405,27 @@ int proto_register(struct proto *prot, int alloc_slab)
}
mutex_lock(&proto_list_mutex);
+ ret = assign_proto_idx(prot);
+ if (ret) {
+ mutex_unlock(&proto_list_mutex);
+ goto out_free_timewait_sock_slab_name;
+ }
list_add(&prot->node, &proto_list);
- assign_proto_idx(prot);
mutex_unlock(&proto_list_mutex);
- return 0;
+ return ret;
out_free_timewait_sock_slab_name:
- kfree(prot->twsk_prot->twsk_slab_name);
+ if (alloc_slab && prot->twsk_prot)
+ kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
- req_prot_cleanup(prot->rsk_prot);
+ if (alloc_slab) {
+ req_prot_cleanup(prot->rsk_prot);
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ }
out:
- return -ENOBUFS;
+ return ret;
}
EXPORT_SYMBOL(proto_register);
@@ -3459,7 +3499,7 @@ static long sock_prot_memory_allocated(struct proto *proto)
return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}
-static char *sock_prot_memory_pressure(struct proto *proto)
+static const char *sock_prot_memory_pressure(struct proto *proto)
{
return proto->memory_pressure != NULL ?
proto_memory_pressure(proto) ? "yes" : "no" : "NI";
@@ -3558,7 +3598,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
{
struct sock *sk = p;
- return !skb_queue_empty(&sk->sk_receive_queue) ||
+ return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);