summaryrefslogtreecommitdiff
path: root/net/core/sock.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/sock.c')
-rw-r--r--net/core/sock.c195
1 files changed, 159 insertions, 36 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index 727f924b7f91..415f441c63b9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
-/* Take into consideration the size of the struct sk_buff overhead in the
- * determination of these values, since that is non-constant across
- * platforms. This makes socket queueing behavior and performance
- * not depend upon such differences.
- */
-#define _SK_MEM_PACKETS 256
-#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
-#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
@@ -1038,6 +1028,10 @@ set_rcvbuf:
#endif
case SO_MAX_PACING_RATE:
+ if (val != ~0U)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
sk->sk_max_pacing_rate = val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
@@ -1051,6 +1045,20 @@ set_rcvbuf:
if (val == 1)
dst_negative_advice(sk);
break;
+
+ case SO_ZEROCOPY:
+ if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+ ret = -ENOTSUPP;
+ else if (sk->sk_protocol != IPPROTO_TCP)
+ ret = -ENOTSUPP;
+ else if (sk->sk_state != TCP_CLOSE)
+ ret = -EBUSY;
+ else if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1074,6 +1082,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
+static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+
+ for (i = 0; i < src->ngroups; i++)
+ if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ return -EFAULT;
+
+ return 0;
+}
+
int sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1227,6 +1247,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
goto lenout;
}
+ case SO_PEERGROUPS:
+ {
+ int ret, n;
+
+ if (!sk->sk_peer_cred)
+ return -ENODATA;
+
+ n = sk->sk_peer_cred->group_info->ngroups;
+ if (len < n * sizeof(gid_t)) {
+ len = n * sizeof(gid_t);
+ return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ }
+ len = n * sizeof(gid_t);
+
+ ret = groups_to_user((gid_t __user *)optval,
+ sk->sk_peer_cred->group_info);
+ if (ret)
+ return ret;
+ goto lenout;
+ }
+
case SO_PEERNAME:
{
char address[128];
@@ -1346,6 +1387,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val64 = sock_gen_cookie(sk);
break;
+ case SO_ZEROCOPY:
+ v.val = sock_flag(sk, SOCK_ZEROCOPY);
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -1491,7 +1536,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
- atomic_set(&sk->sk_wmem_alloc, 1);
+ refcount_set(&sk->sk_wmem_alloc, 1);
mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -1515,7 +1560,7 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_destruct(sk);
filter = rcu_dereference_check(sk->sk_filter,
- atomic_read(&sk->sk_wmem_alloc) == 0);
+ refcount_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
@@ -1565,7 +1610,7 @@ void sk_free(struct sock *sk)
* some packets are still in some tx queue.
* If not null, sock_wfree() will call __sk_free(sk) later
*/
- if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+ if (refcount_dec_and_test(&sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
@@ -1609,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sock_copy(newsk, sk);
+ newsk->sk_prot_creator = sk->sk_prot;
+
/* SANITY */
if (likely(newsk->sk_net_refcnt))
get_net(sock_net(newsk));
@@ -1622,7 +1669,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
/*
* sk_wmem_alloc set to one (see sk_free() and sock_wfree())
*/
- atomic_set(&newsk->sk_wmem_alloc, 1);
+ refcount_set(&newsk->sk_wmem_alloc, 1);
atomic_set(&newsk->sk_omem_alloc, 0);
sk_init_common(newsk);
@@ -1630,19 +1677,28 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
+
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+
atomic_set(&newsk->sk_drops, 0);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
+ cgroup_sk_alloc(&newsk->sk_cgrp_data);
- filter = rcu_dereference_protected(newsk->sk_filter, 1);
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
/* though it's an empty new sock, the charging may fail
* if sysctl_optmem_max was changed between creation of
* original socket and cloning
*/
is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
/* We need to make sure that we don't uncharge the new
@@ -1663,15 +1719,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(&newsk->sk_cookie, 0);
- mem_cgroup_sk_alloc(newsk);
- cgroup_sk_alloc(&newsk->sk_cgrp_data);
-
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
- atomic_set(&newsk->sk_refcnt, 2);
+ refcount_set(&newsk->sk_refcnt, 2);
/*
* Increment the counter in the same struct proto as the master
@@ -1720,7 +1773,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
sk->sk_route_caps &= ~sk->sk_route_nocaps;
if (sk_can_gso(sk)) {
- if (dst->header_len) {
+ if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
@@ -1750,7 +1803,7 @@ void sock_wfree(struct sk_buff *skb)
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
*/
- atomic_sub(len - 1, &sk->sk_wmem_alloc);
+ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
sk->sk_write_space(sk);
len = 1;
}
@@ -1758,7 +1811,7 @@ void sock_wfree(struct sk_buff *skb)
* if sk_wmem_alloc reaches 0, we must finish what sk_free()
* could not do because of in-flight packets
*/
- if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
@@ -1770,7 +1823,7 @@ void __sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
__sk_free(sk);
}
@@ -1792,7 +1845,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
* is enough to guarantee sk_free() wont free this sock until
* all in-flight packets are completed
*/
- atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
@@ -1814,8 +1867,8 @@ void skb_orphan_partial(struct sk_buff *skb)
) {
struct sock *sk = skb->sk;
- if (atomic_inc_not_zero(&sk->sk_refcnt)) {
- atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
skb->destructor = sock_efree;
}
} else {
@@ -1875,7 +1928,7 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
if (skb) {
skb_set_owner_w(skb, sk);
@@ -1886,6 +1939,33 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
}
EXPORT_SYMBOL(sock_wmalloc);
+static void sock_ofree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_omem_alloc);
+}
+
+struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
+ gfp_t priority)
+{
+ struct sk_buff *skb;
+
+ /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
+ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
+ sysctl_optmem_max)
+ return NULL;
+
+ skb = alloc_skb(size, priority);
+ if (!skb)
+ return NULL;
+
+ atomic_add(skb->truesize, &sk->sk_omem_alloc);
+ skb->sk = sk;
+ skb->destructor = sock_ofree;
+ return skb;
+}
+
/*
* Allocate a memory block from the socket's option memory buffer.
*/
@@ -1950,7 +2030,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2072,6 +2152,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
}
EXPORT_SYMBOL(sock_cmsg_send);
+static void sk_enter_memory_pressure(struct sock *sk)
+{
+ if (!sk->sk_prot->enter_memory_pressure)
+ return;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+ if (sk->sk_prot->leave_memory_pressure) {
+ sk->sk_prot->leave_memory_pressure(sk);
+ } else {
+ unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+ if (memory_pressure && *memory_pressure)
+ *memory_pressure = 0;
+ }
+}
+
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
@@ -2253,7 +2353,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
return 1;
- } else if (atomic_read(&sk->sk_wmem_alloc) <
+ } else if (refcount_read(&sk->sk_wmem_alloc) <
prot->sysctl_wmem[0])
return 1;
}
@@ -2351,9 +2451,6 @@ EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
{
- if (val < 0)
- return -EINVAL;
-
sk->sk_peek_off = val;
return 0;
}
@@ -2443,6 +2540,12 @@ int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
}
EXPORT_SYMBOL(sock_no_sendmsg);
+int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_sendmsg_locked);
+
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
int flags)
{
@@ -2471,6 +2574,22 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz
}
EXPORT_SYMBOL(sock_no_sendpage);
+ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ ssize_t res;
+ struct msghdr msg = {.msg_flags = flags};
+ struct kvec iov;
+ char *kaddr = kmap(page);
+
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
+ kunmap(page);
+ return res;
+}
+EXPORT_SYMBOL(sock_no_sendpage_locked);
+
/*
* Default Socket Callbacks
*/
@@ -2520,7 +2639,7 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
@@ -2616,6 +2735,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = SK_DEFAULT_STAMP;
+ atomic_set(&sk->sk_zckey, 0);
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
@@ -2630,7 +2750,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
- atomic_set(&sk->sk_refcnt, 1);
+ refcount_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
@@ -2675,9 +2795,12 @@ EXPORT_SYMBOL(release_sock);
* @sk: socket
*
* This version should be used for very small section, where process wont block
- * return false if fast path is taken
+ * return false if fast path is taken:
+ *
* sk_lock.slock locked, owned = 0, BH disabled
- * return true if slow path is taken
+ *
+ * return true if slow path is taken:
+ *
* sk_lock.slock unlocked, owned = 1, BH enabled
*/
bool lock_sock_fast(struct sock *sk)