summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2013-01-23 22:44:10 +0400
committerDavid S. Miller <davem@davemloft.net>2013-01-23 22:44:10 +0400
commitc617f398edd4db2b8567a28e899a88f8f574798d (patch)
tree8769b30ecfc83acceb5c0095e21db371efad8b61 /include
parent4a633a602c26497b8285a202830829d3be007c7b (diff)
parent72289b96c943757220ccc681fe2e22b46e21aced (diff)
downloadlinux-c617f398edd4db2b8567a28e899a88f8f574798d.tar.xz
Merge branch 'soreuseport'
Tom Herbert says: ==================== This series implements so_reuseport (SO_REUSEPORT socket option) for TCP and UDP.  For TCP, so_reuseport allows multiple listener sockets to be bound to the same port.  In the case of UDP, so_reuseport allows multiple sockets to bind to the same port.  To prevent port hijacking all sockets bound to the same port using so_reuseport must have the same uid.  Received packets are distributed to multiple sockets bound to the same port using a 4-tuple hash. The motivating case for so_reuseport in TCP would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket.  This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads.  In case #1 the listener thread can easily become the bottleneck with high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming simple event loop: while (1) { accept(); process() }); wakeup does not promote fairness among the sockets.  We have seen the disproportion to be as high as 3:1 ratio between thread accepting most connections and the one accepting the fewest.  With so_reuseport the distribution is uniform. The TCP implementation has a problem in that the request sockets for a listener are attached to a listener socket.  If a SYN is received, a listener socket is chosen and request structure is created (SYN-RECV state).  If the subsequent ACK in the 3WHS does not match the same port by so_reuseport, the connection state is not found (reset) and the request structure is orphaned.  This scenario would occur when the number of listener sockets bound to a port changes (new ones are added, or old ones closed).  We are looking for a solution to this, maybe allow multiple sockets to share the same request table... 
The motivating case for so_reuseport in UDP would be something like a DNS server.  An alternative would be to recv on the same socket from multiple threads.  As in the case of TCP, the load across these threads tends to be disproportionate and we also see a lot of contention on the socket lock.  Note that SO_REUSEADDR already allows multiple UDP sockets to bind to the same port, however there is no provision to prevent hijacking and nothing to distribute packets across all the sockets sharing the same bound port.  This patch does not change the semantics of SO_REUSEADDR, but provides usable functionality of it for unicast. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/random.h6
-rw-r--r--include/net/inet6_hashtables.h5
-rw-r--r--include/net/inet_hashtables.h13
-rw-r--r--include/net/netfilter/nf_tproxy_core.h2
-rw-r--r--include/net/sock.h5
-rw-r--r--include/uapi/asm-generic/socket.h3
6 files changed, 27 insertions, 7 deletions
diff --git a/include/linux/random.h b/include/linux/random.h
index d9846088c2c5..347ce553a306 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -74,4 +74,10 @@ static inline int arch_get_random_int(unsigned int *v)
}
#endif
+/*
+ * Pseudo random number generator from numerical recipes.
+ *
+ * Computes one step of the linear congruential recurrence
+ * seed * 1664525 + 1013904223 and returns the result. The function keeps
+ * no state of its own: the current value is passed in via @seed.
+ *
+ * NOTE(review): the result presumably wraps modulo 2^32, assuming u32 is
+ * a 32-bit unsigned type (its definition is not visible here) - confirm.
+ */
+static inline u32 next_pseudo_random32(u32 seed)
+{
+ return seed * 1664525 + 1013904223;
+}
+
#endif /* _LINUX_RANDOM_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 9e34c877a770..7ca75cbbf75e 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net,
extern struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const struct in6_addr *saddr,
+ const __be16 sport,
const struct in6_addr *daddr,
const unsigned short hnum,
const int dif);
@@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net,
if (sk)
return sk;
- return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif);
+ return inet6_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, hnum, dif);
}
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 67a8fa098e3a..7b2ae9d37076 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,7 +81,9 @@ struct inet_bind_bucket {
struct net *ib_net;
#endif
unsigned short port;
- signed short fastreuse;
+ signed char fastreuse;
+ signed char fastreuseport;
+ kuid_t fastuid;
int num_owners;
struct hlist_node node;
struct hlist_head owners;
@@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
extern struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr,
+ const __be16 sport,
const __be32 daddr,
const unsigned short hnum,
const int dif);
static inline struct sock *inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
+ return __inet_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, ntohs(dport), dif);
}
/* Socket demux engine toys. */
@@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif);
- return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
+ return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, hnum, dif);
}
static inline struct sock *inet_lookup(struct net *net,
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 75ca9291cf2c..36d9379d4c4b 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
break;
case NFT_LOOKUP_LISTENER:
sk = inet_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
daddr, dport,
in->ifindex);
@@ -151,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
break;
case NFT_LOOKUP_LISTENER:
sk = inet6_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
daddr, ntohs(dport),
in->ifindex);
diff --git a/include/net/sock.h b/include/net/sock.h
index 5a34e2f03657..581dc6bd7dc6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -140,6 +140,7 @@ typedef __u64 __bitwise __addrpair;
* @skc_family: network address family
* @skc_state: Connection state
* @skc_reuse: %SO_REUSEADDR setting
+ * @skc_reuseport: %SO_REUSEPORT setting
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -179,7 +180,8 @@ struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
- unsigned char skc_reuse;
+ unsigned char skc_reuse:4;
+ unsigned char skc_reuseport:4;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
@@ -297,6 +299,7 @@ struct sock {
#define sk_family __sk_common.skc_family
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
+#define sk_reuseport __sk_common.skc_reuseport
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 3f6a99201410..4ef3acbba5da 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -22,8 +22,7 @@
#define SO_PRIORITY 12
#define SO_LINGER 13
#define SO_BSDCOMPAT 14
-/* To add :#define SO_REUSEPORT 15 */
-
+#define SO_REUSEPORT 15
#ifndef SO_PASSCRED /* powerpc only differs in these */
#define SO_PASSCRED 16
#define SO_PEERCRED 17