author    Paolo Abeni <pabeni@redhat.com>   2026-03-05 13:32:49 +0300
committer Paolo Abeni <pabeni@redhat.com>   2026-03-05 13:32:50 +0300
commit    6d32a196beb41525f055f720e98f97fb441f0ee0
tree      530e7bd31012de8ae7483c50641b48501d070161
parent    d8103bfe41eeb8b6842672502a0025b7278931a8
parent    f20c73b0460d15301cf1bddf0f85d060a38a75df
Merge tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Florian Westphal says:
====================
netfilter: updates for net-next
The following patchset contains Netfilter updates for *net-next*,
including changes to the IPv6 stack and updates to IPVS from Julian Anastasov.
1) ipv6: export fib6_lookup for nft_fib_ipv6 module
2) factor out the ipv6_anycast_destination logic so it's usable without a
dst_entry. These are dependencies for patch 3.
3) switch the nft_fib_ipv6 module to fib6_lookup() + RCU so it no longer
needs temporary dst_entry object allocations.
This gets us ~13% higher packet rate in my tests; a simplified sketch of
the dst-free lookup pattern follows the commit message below.
Patches 4 to 8, from Eric Dumazet, zap sk_callback_lock usage in
netfilter. Patch 9 removes another sk_callback_lock instance.
Remaining patches, from Julian Anastasov, improve IPVS; a simplified sketch
of the two-table lookup idea they build on follows the diffstat below.
Quoting Julian:
* Add infrastructure for resizable hash tables based on hlist_bl.
* Change the 256-bucket service hash table to be resizable.
* Change the global connection table to be per-net and resizable.
* Make connection hashing more secure for setups with multiple services.
netfilter pull request nf-next-26-03-04
* tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
ipvs: use more keys for connection hashing
ipvs: switch to per-net connection table
ipvs: use resizable hash table for services
ipvs: add resizable hash tables
rculist_bl: add hlist_bl_for_each_entry_continue_rcu
netfilter: nfnetlink_queue: remove locking in nfqnl_get_sk_secctx
netfilter: nfnetlink_queue: no longer acquire sk_callback_lock
netfilter: nfnetlink_log: no longer acquire sk_callback_lock
netfilter: nft_meta: no longer acquire sk_callback_lock in nft_meta_get_eval_skugid()
netfilter: xt_owner: no longer acquire sk_callback_lock in mt_owner()
netfilter: nf_log_syslog: no longer acquire sk_callback_lock in nf_log_dump_sk_uid_gid()
netfilter: nft_fib_ipv6: switch to fib6_lookup
ipv6: make ipv6_anycast_destination logic usable without dst_entry
ipv6: export fib6_lookup for nft_fib_ipv6
====================
Link: https://patch.msgid.link/20260304114921.31042-1-fw@strlen.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
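As a rough illustration of item 3 in the message above (not the patch itself): with fib6_lookup() the route can be classified entirely under the RCU read lock from the struct fib6_result, with no temporary rt6_info/dst_entry allocation and no dst_release(). The function name example_route_type() below is invented for this sketch; the real conversion is in net/ipv6/netfilter/nft_fib_ipv6.c further down in this diff.

/* Hedged sketch, not the patch: classify an IPv6 route under the RCU read
 * lock using fib6_lookup() and struct fib6_result, so no temporary
 * dst_entry/rt6_info has to be allocated or released.
 */
#include <net/ip6_fib.h>
#include <net/ip6_route.h>

static int example_route_type(struct net *net, struct flowi6 *fl6, int flags)
{
	struct fib6_result res = {};
	int type = RTN_UNICAST;

	rcu_read_lock();
	if (fib6_lookup(net, fl6->flowi6_oif, fl6, &res, flags) == 0) {
		if (res.fib6_flags & RTF_REJECT)
			type = res.fib6_type;	/* e.g. RTN_UNREACHABLE */
		else if (res.fib6_flags & RTF_LOCAL)
			type = RTN_LOCAL;
	}
	rcu_read_unlock();
	return type;
}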
-rw-r--r--  include/linux/rculist_bl.h          49
-rw-r--r--  include/net/ip6_route.h             15
-rw-r--r--  include/net/ip_vs.h                377
-rw-r--r--  net/ipv6/fib6_rules.c                3
-rw-r--r--  net/ipv6/ip6_fib.c                   3
-rw-r--r--  net/ipv6/netfilter/nft_fib_ipv6.c   79
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c    991
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c    179
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c     691
-rw-r--r--  net/netfilter/ipvs/ip_vs_pe_sip.c    4
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c     23
-rw-r--r--  net/netfilter/nf_log_syslog.c       16
-rw-r--r--  net/netfilter/nfnetlink_log.c       19
-rw-r--r--  net/netfilter/nfnetlink_queue.c     24
-rw-r--r--  net/netfilter/nft_meta.c            23
-rw-r--r--  net/netfilter/xt_owner.c            28
16 files changed, 2011 insertions, 513 deletions
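Before the diff proper, a simplified sketch of the lookup pattern the new resizable IPVS tables are built around: during a resize two tables are briefly live, chained through a new_tbl pointer, so an RCU reader probes the current table and, if a resize is in flight, its successor. All names below (toy_rht, toy_entry, toy_hash, toy_lookup) are invented stand-ins; the real ip_vs_rht structures, the seqcount-based retry and the per-table siphash keys are in include/net/ip_vs.h and net/netfilter/ipvs/ip_vs_conn.c in this diff.

/* Hedged sketch only: two-table RCU lookup in the spirit of ip_vs_rht.
 * Caller holds rcu_read_lock(). The seqcount retry the real code uses to
 * cope with entries moving between buckets during a resize is omitted.
 */
#include <linux/jhash.h>
#include <linux/rculist_bl.h>
#include <linux/rcupdate.h>

struct toy_entry {				/* invented example type */
	struct hlist_bl_node node;
	u32 key;
};

struct toy_rht {				/* invented example type */
	struct hlist_bl_head *buckets;
	struct toy_rht __rcu *new_tbl;		/* points to itself when idle */
	u32 mask;
};

/* Stand-in for per-table hashing; the real code keys a siphash per table */
static u32 toy_hash(const struct toy_rht *t, u32 key)
{
	return jhash_1word(key, (u32)(unsigned long)t);
}

static struct toy_entry *toy_lookup(struct toy_rht __rcu *table, u32 key)
{
	struct toy_rht *t, *prev = NULL;
	struct hlist_bl_node *pos;
	struct toy_entry *e;

	/* Visit at most two tables: the current one and, while a resize is
	 * in progress, its successor reachable through new_tbl.
	 */
	for (t = rcu_dereference(table); t != prev;
	     prev = t, t = rcu_dereference(t->new_tbl)) {
		struct hlist_bl_head *head;

		head = &t->buckets[toy_hash(t, key) & t->mask];
		hlist_bl_for_each_entry_rcu(e, pos, head, node)
			if (e->key == key)
				return e;
	}
	return NULL;
}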
diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..36363b876e53 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,21 +8,31 @@ #include <linux/list_bl.h> #include <linux/rcupdate.h> +/* return the first ptr or next element in an RCU protected list */ +#define hlist_bl_first_rcu(head) \ + (*((struct hlist_bl_node __rcu **)(&(head)->first))) +#define hlist_bl_next_rcu(node) \ + (*((struct hlist_bl_node __rcu **)(&(node)->next))) + static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); - rcu_assign_pointer(h->first, + rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) -{ - return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); -} +#define hlist_bl_first_rcu_dereference(head) \ +({ \ + struct hlist_bl_head *__head = (head); \ + \ + (struct hlist_bl_node *) \ + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ + hlist_bl_is_locked(__head)) & \ + ~LIST_BL_LOCKMASK); \ +}) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization @@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, { struct hlist_bl_node *first; - /* don't need hlist_bl_first_rcu because we're under lock */ + /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; @@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = hlist_bl_first_rcu(head); \ + for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) + +/** + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given + * type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @member: the name of the hlist_bl_node within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position which must have been in the list when the RCU read + * lock was taken. + * This would typically require either that you obtained the node from a + * previous walk of the list in the same RCU read-side critical section, or + * that you held some sort of non-RCU reference (such as a reference count) + * to keep the node alive *and* in the list. 
+ */ +#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index a55f9bf95fe3..0c8eeb6abe7a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -252,15 +252,22 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb) return rt->rt6i_flags & RTF_LOCAL; } +static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst, + u32 rt6i_flags, + const struct in6_addr *daddr) +{ + return rt6i_flags & RTF_ANYCAST || + (rt6i_dst->plen < 127 && + !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && + ipv6_addr_equal(&rt6i_dst->addr, daddr)); +} + static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); - return rt->rt6i_flags & RTF_ANYCAST || - (rt->rt6i_dst.plen < 127 && - !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); + return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr); } int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ad8a16146ac5..72d325c81313 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -11,6 +11,7 @@ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ +#include <linux/rculist_bl.h> /* for struct hlist_bl_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ @@ -30,15 +31,22 @@ #endif #include <net/net_namespace.h> /* Netw namespace */ #include <linux/sched/isolation.h> +#include <linux/siphash.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* conn_tab limits (as per Kconfig) */ +#define IP_VS_CONN_TAB_MIN_BITS 8 +#if BITS_PER_LONG > 32 +#define IP_VS_CONN_TAB_MAX_BITS 27 +#else +#define IP_VS_CONN_TAB_MAX_BITS 20 +#endif + +/* svc_table limits */ +#define IP_VS_SVC_TAB_MIN_BITS 4 +#define IP_VS_SVC_TAB_MAX_BITS 20 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -49,8 +57,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; -extern struct mutex __ip_vs_mutex; - struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -271,6 +277,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +struct ip_vs_aligned_lock { + spinlock_t l; /* Protect buckets */ +} ____cacheline_aligned_in_smp; + /* For arrays per family */ enum { IP_VS_AF_INET, @@ -283,6 +293,13 @@ static inline int ip_vs_af_index(int af) return af == AF_INET6 ? 
IP_VS_AF_INET6 : IP_VS_AF_INET; } +/* work_flags */ +enum { + IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ + IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ +}; + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -484,6 +501,198 @@ struct ip_vs_est_kt_data { int est_row; /* estimated row */ }; +/* IPVS resizable hash tables */ +struct ip_vs_rht { + struct hlist_bl_head *buckets; + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ + seqcount_t *seqc; /* Protects moves */ + struct ip_vs_aligned_lock *lock; /* Protect seqc */ + int mask; /* Buckets mask */ + int size; /* Buckets */ + int seqc_mask; /* seqc mask */ + int lock_mask; /* lock mask */ + u32 table_id; + int u_thresh; /* upper threshold */ + int l_thresh; /* lower threshold */ + int lfactor; /* Load Factor (shift)*/ + int bits; /* size = 1 << bits */ + siphash_key_t hash_key; + struct rcu_head rcu_head; +}; + +/** + * ip_vs_rht_for_each_table() - Walk the hash tables + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * Walk tables assuming others can not change the installed tables + */ +#define ip_vs_rht_for_each_table(table, t, p) \ + for (p = NULL, t = rcu_dereference_protected(table, 1); \ + t != p; \ + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) + +/** + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * We usually search in one table and also in second table on resizing + */ +#define ip_vs_rht_for_each_table_rcu(table, t, p) \ + for (p = NULL, t = rcu_dereference(table); \ + t != p; \ + p = t, t = rcu_dereference(t->new_tbl)) + +/** + * ip_vs_rht_for_each_bucket() - Walk all table buckets + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: bucket index, used as cursor, u32 var + * @head: bucket address, used as cursor, struct hlist_bl_head *var + */ +#define ip_vs_rht_for_each_bucket(t, bucket, head) \ + for (bucket = 0, head = (t)->buckets; \ + bucket < t->size; bucket++, head++) + +/** + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @sc: temp seqcount_t *var + * @seq: temp unsigned int var for sequence count + * @retry: temp int var + */ +#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ + retry = read_seqcount_retry(sc, seq)) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ + struct ip_vs_rht *_t, *_p; \ + unsigned int _seq; \ + seqcount_t *_sc; \ + u32 _bucket; \ + int _retry +/** + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, 
no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_buckets_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ + _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_bucket_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ + unsigned int _seq; \ + seqcount_t *_sc; \ + int _retry +/** + * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock + * @t: current table, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ + if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ + {} \ + else \ + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_safe_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket +/** + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete entries but moving is disabled + * Using cond_resched_rcu() should be safe if tables do not change + */ +#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket + +/** + * ip_vs_rht_walk_buckets() - Walk all buckets + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Use if others can not add/delete/move entries + */ +#define ip_vs_rht_walk_buckets(table, head) \ + ip_vs_rht_for_each_table(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/* Entries can be in one of two tables, so we flip bit when new table is + * created and store it as highest bit in hash keys + */ +#define IP_VS_RHT_TABLE_ID_MASK BIT(31) + +/* Check if hash key is from this table */ +static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) +{ + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); +} + +/* Build per-table hash key from hash value */ +static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) +{ + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); +} + +void ip_vs_rht_free(struct ip_vs_rht *t); +void ip_vs_rht_rcu_free(struct rcu_head *head); +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits); +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + 
int min_bits, int max_bits); +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2); + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -577,50 +786,48 @@ struct ip_vs_conn_param { __u8 pe_data_len; }; +/* Hash node in conn_tab */ +struct ip_vs_conn_hnode { + struct hlist_bl_node node; /* node in conn_tab */ + u32 hash_key; /* Key for the hash table */ + u8 dir; /* 0=out->in, 1=in->out */ +} __packed; + /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_node c_list; /* hashed list heads */ - /* Protocol, addresses and port numbers */ + /* Cacheline for hash table nodes - rarely modified */ + + struct ip_vs_conn_hnode hn0; /* Original direction */ + u8 af; /* address family */ __be16 cport; + struct ip_vs_conn_hnode hn1; /* Reply direction */ + u8 daf; /* Address family of the dest */ __be16 dport; - __be16 vport; - u16 af; /* address family */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ + struct ip_vs_dest *dest; /* real server */ + atomic_t n_control; /* Number of controlled ones */ volatile __u32 flags; /* status flags */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ - struct netns_ipvs *ipvs; - - /* counter and timer */ - refcount_t refcnt; /* reference count */ - struct timer_list timer; /* Expiration timer */ - volatile unsigned long timeout; /* timeout */ + /* 44/64 */ - /* Flags and state transition */ - spinlock_t lock; /* lock for state transition */ + struct ip_vs_conn *control; /* Master control connection */ + const struct ip_vs_pe *pe; + char *pe_data; + __u8 pe_data_len; volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ - __u32 fwmark; /* Fire wall mark from skb */ - unsigned long sync_endtime; /* jiffies + sent_retries */ + /* 2-byte hole */ + /* 64/96 */ - /* Control members */ - struct ip_vs_conn *control; /* Master control connection */ - atomic_t n_control; /* Number of controlled ones */ - struct ip_vs_dest *dest; /* real server */ - atomic_t in_pkts; /* incoming packet counter */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + /* 96/128 */ - /* Packet transmitter for different forwarding methods. If it - * mangles the packet, it must return NF_DROP or better NF_STOLEN, - * otherwise this must be changed to a sk_buff **. - * NF_ACCEPT can be returned when destination is local. - */ - int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); + union nf_inet_addr daddr; /* destination address */ + __u32 fwmark; /* Fire wall mark from skb */ + __be16 vport; + __u16 protocol; /* Which protocol (TCP/UDP) */ /* Note: we can group the following members into a structure, * in order to save more space, and the following members are @@ -628,14 +835,31 @@ struct ip_vs_conn { */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ + /* 128/168 */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. 
struct */ ); + /* 152/192 */ - const struct ip_vs_pe *pe; - char *pe_data; - __u8 pe_data_len; + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + spinlock_t lock; /* lock for state transition */ + refcount_t refcnt; /* reference count */ + atomic_t in_pkts; /* incoming packet counter */ + /* 64-bit: 4-byte gap */ + + /* 188/256 */ + unsigned long sync_endtime; /* jiffies + sent_retries */ + struct netns_ipvs *ipvs; + + /* Packet transmitter for different forwarding methods. If it + * mangles the packet, it must return NF_DROP or better NF_STOLEN, + * otherwise this must be changed to a sk_buff **. + * NF_ACCEPT can be returned when destination is local. + */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); struct rcu_head rcu_head; }; @@ -691,14 +915,15 @@ struct ip_vs_dest_user_kern { * forwarding entries. */ struct ip_vs_service { - struct hlist_node s_list; /* node in service table */ - atomic_t refcnt; /* reference counter */ - + struct hlist_bl_node s_list; /* node in service table */ + u32 hash_key; /* Key for the hash table */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ + union nf_inet_addr addr; /* IP address for virtual service */ - __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ + atomic_t refcnt; /* reference counter */ + __be16 port; /* port number for the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ @@ -808,8 +1033,8 @@ struct ip_vs_pe { int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, - bool inverse); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, + struct ip_vs_rht *t, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, @@ -949,6 +1174,7 @@ struct netns_ipvs { /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ atomic_t no_cport_conns[IP_VS_AF_MAX]; + struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ @@ -957,6 +1183,10 @@ struct netns_ipvs { struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ + struct mutex service_mutex; /* service reconfig */ + struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct delayed_work svc_resize_work; /* resize svc_table */ + atomic_t svc_table_changes;/* ++ on new table */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ @@ -1021,6 +1251,8 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_conn_lfactor; + int sysctl_svc_lfactor; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1030,6 +1262,7 @@ struct netns_ipvs { int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; + unsigned long work_flags; /* IP_VS_WORK_* flags */ /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ @@ -1061,9 +1294,9 @@ struct netns_ipvs { unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ - /* the service mutex that protect svc_table and svc_fwm_table */ - struct mutex service_mutex; - struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ + struct ip_vs_rht __rcu *svc_table; /* Services */ + struct ip_vs_rht __rcu *conn_tab; /* Connections */ + atomic_t conn_tab_changes;/* ++ on new table */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1313,6 +1546,24 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map conn_count/u_thresh to t->size */ +static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_conn_lfactor); +} + +/* Get load factor to map num_services/u_thresh to t->size + * Smaller value decreases u_thresh to reduce collisions but increases + * the table size + * Returns factor where: + * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load + * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load + */ +static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_svc_lfactor); +} + /* IPVS core functions * (from ip_vs_core.c) */ @@ -1386,6 +1637,23 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor); +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor); + +static inline struct ip_vs_conn * +ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn) +{ + return container_of(hn, struct ip_vs_conn, hn0); +} + +static inline struct ip_vs_conn * +ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn) +{ + return hn->dir ? 
container_of(hn, struct ip_vs_conn, hn1) : + container_of(hn, struct ip_vs_conn, hn0); +} struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, @@ -1739,6 +2007,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) return fwd; } +/* Check if connection uses double hashing */ +static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp) +{ + return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ && + !(cp->flags & IP_VS_CONN_F_TEMPLATE); +} + void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index fd5f7112a51f..e1b2b4fa6e18 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -92,6 +92,9 @@ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, return err; } +#if IS_MODULE(CONFIG_NFT_FIB_IPV6) +EXPORT_SYMBOL_GPL(fib6_lookup); +#endif struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9058e71241dc..a6e58a435735 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -342,6 +342,9 @@ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } +#if IS_MODULE(CONFIG_NFT_FIB_IPV6) +EXPORT_SYMBOL_GPL(fib6_lookup); +#endif static void __net_init fib6_tables_init(struct net *net) { diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index dc375b725b28..8b2dba88ee96 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -52,7 +52,13 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); - return lookup_flags; + return lookup_flags | RT6_LOOKUP_F_DST_NOREF; +} + +static int nft_fib6_lookup(struct net *net, struct flowi6 *fl6, + struct fib6_result *res, int flags) +{ + return fib6_lookup(net, fl6->flowi6_oif, fl6, res, flags); } static u32 __nft_fib6_eval_type(const struct nft_fib *priv, @@ -60,13 +66,14 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, struct ipv6hdr *iph) { const struct net_device *dev = NULL; + struct fib6_result res = {}; int route_err, addrtype; - struct rt6_info *rt; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; + int lookup_flags; u32 ret = 0; if (priv->flags & NFTA_FIB_F_IIF) @@ -74,29 +81,23 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) ret = RTN_LOCAL; - route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt, - flowi6_to_flowi(&fl6), false); + route_err = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags); if (route_err) goto err; - if (rt->rt6i_flags & RTF_REJECT) { - route_err = rt->dst.error; - dst_release(&rt->dst); - goto err; - } + if (res.fib6_flags & RTF_REJECT) + return res.fib6_type; - if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr)) + if (__ipv6_anycast_destination(&res.f6i->fib6_dst, res.fib6_flags, &fl6.daddr)) ret = RTN_ANYCAST; - else if (!dev && rt->rt6i_flags & RTF_LOCAL) + else if (!dev 
&& res.fib6_flags & RTF_LOCAL) ret = RTN_LOCAL; - dst_release(&rt->dst); - if (ret) return ret; @@ -152,6 +153,33 @@ static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const str return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL; } +static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev, + const struct net_device *dev) +{ + return nh_dev == dev || + l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex; +} + +static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt, + const struct net_device *dev) +{ + const struct net_device *nh_dev; + struct fib6_info *iter; + + nh_dev = fib6_info_nh_dev(rt); + if (nft_fib6_info_nh_dev_match(nh_dev, dev)) + return true; + + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { + nh_dev = fib6_info_nh_dev(iter); + + if (nft_fib6_info_nh_dev_match(nh_dev, dev)) + return true; + } + + return false; +} + void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -160,14 +188,14 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; + struct fib6_result res = {}; struct ipv6hdr *iph, _iph; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; - struct rt6_info *rt; - int lookup_flags; + int lookup_flags, ret; if (nft_fib_can_skip(pkt)) { nft_fib_store_result(dest, priv, nft_in(pkt)); @@ -193,26 +221,17 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); *dest = 0; - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, - lookup_flags); - if (rt->dst.error) - goto put_rt_err; - - /* Should not see RTF_LOCAL here */ - if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) - goto put_rt_err; + ret = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags); + if (ret || res.fib6_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) + return; if (!oif) { - found = rt->rt6i_idev->dev; + found = fib6_info_nh_dev(res.f6i); } else { - if (oif == rt->rt6i_idev->dev || - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + if (nft_fib6_info_nh_uses_dev(res.f6i, oif)) found = oif; } - nft_fib_store_result(dest, priv, found); - put_rt_err: - ip6_rt_put(rt); } EXPORT_SYMBOL_GPL(nft_fib6_eval); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index deaf16a90e2f..2082bfb2d93c 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -47,28 +47,12 @@ static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); -/* size and mask values */ +/* Max table size */ int ip_vs_conn_tab_size __read_mostly; -static int ip_vs_conn_tab_mask __read_mostly; - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd __read_mostly; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 5 -#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) -#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) - /* We need an addrstrlen that 
works with or without v6 */ #ifdef CONFIG_IP_VS_IPV6 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN @@ -76,23 +60,102 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define IP_VS_ADDRSTRLEN (8+1) #endif -struct ip_vs_aligned_lock +/* Connection hashing: + * - hash (add conn) and unhash (del conn) are safe for RCU readers walking + * the bucket, they will not jump to another bucket or hash table and to miss + * conns + * - rehash (fill cport) hashes the conn to new bucket or even new table, + * so we use seqcount to retry lookups on buckets where we delete + * conns (unhash) because after hashing their next ptr can point to another + * bucket or hash table + * - hash table resize works like rehash but always rehashes into new table + * - bit lock on bucket serializes all operations that modify the chain + * - cp->lock protects conn fields like cp->flags, cp->dest + */ + +/* Lock conn_tab bucket for conn hash/unhash, not for rehash */ +static __always_inline void +conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, + u32 hash_key2, bool use2, bool new_hash, + struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret) { - spinlock_t l; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); + struct hlist_bl_head *head, *head2; + u32 hash_key_new, hash_key_new2; + struct ip_vs_rht *t2 = t; + u32 idx, idx2; + + idx = hash_key & t->mask; + if (use2) + idx2 = hash_key2 & t->mask; + else + idx2 = idx; + if (!new_hash) { + /* We need to lock the bucket in the right table */ + +retry: + if (!ip_vs_rht_same_table(t, hash_key)) { + /* It is already moved to new table */ + t = rcu_dereference(t->new_tbl); + /* Rehashing works in two steps and we may detect + * both nodes in different tables, use idx/idx2 + * for proper lock ordering for heads. + */ + idx = hash_key & t->mask; + idx |= IP_VS_RHT_TABLE_ID_MASK; + } + if (use2) { + if (!ip_vs_rht_same_table(t2, hash_key2)) { + /* It is already moved to new table */ + t2 = rcu_dereference(t2->new_tbl); + idx2 = hash_key2 & t2->mask; + idx2 |= IP_VS_RHT_TABLE_ID_MASK; + } + } else { + idx2 = idx; + } + } -/* lock array for conn table */ -static struct ip_vs_aligned_lock -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; + head = t->buckets + (hash_key & t->mask); + head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head; -static inline void ct_write_lock_bh(unsigned int key) -{ - spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + local_bh_disable(); + /* Do not touch seqcount, this is a safe operation */ + + if (idx <= idx2) { + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + } else { + hlist_bl_lock(head2); + hlist_bl_lock(head); + } + if (!new_hash) { + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(cp->hn0.hash_key); + hash_key_new2 = READ_ONCE(cp->hn1.hash_key); + /* Hash changed ? 
*/ + if (hash_key != hash_key_new || + (hash_key2 != hash_key_new2 && use2)) { + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + local_bh_enable(); + hash_key = hash_key_new; + hash_key2 = hash_key_new2; + goto retry; + } + } + *head_ret = head; + *head2_ret = head2; } -static inline void ct_write_unlock_bh(unsigned int key) +static inline void conn_tab_unlock(struct hlist_bl_head *head, + struct hlist_bl_head *head2) { - spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + local_bh_enable(); } static void ip_vs_conn_expire(struct timer_list *t); @@ -100,48 +163,70 @@ static void ip_vs_conn_expire(struct timer_list *t); /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, - const union nf_inet_addr *addr, - __be16 port) +static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port, + const union nf_inet_addr *laddr, __be16 lport) { + u64 a = (u32)proto << 16 | (__force u32)port; + u64 d; + #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), - (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + if (af == AF_INET6) { + u64 b = (u64)addr->all[0] << 32 | addr->all[1]; + u64 c = (u64)addr->all[2] << 32 | addr->all[3]; + + a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport; + c ^= laddr->all[1]; + d = (u64)laddr->all[0] << 32 | laddr->all[3]; + return (u32)siphash_4u64(a, b, c, d, &t->hash_key); + } #endif - return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + a |= (u64)addr->all[0] << 32; + d = (u64)laddr->all[0] << 32 | (__force u32)lport; + return (u32)siphash_2u64(a, d, &t->hash_key); } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, - bool inverse) + struct ip_vs_rht *t, bool inverse) { + const union nf_inet_addr *laddr; const union nf_inet_addr *addr; + __be16 lport; __be16 port; if (p->pe_data && p->pe->hashkey_raw) - return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & - ip_vs_conn_tab_mask; + return p->pe->hashkey_raw(p, t, inverse); if (likely(!inverse)) { addr = p->caddr; port = p->cport; + laddr = p->vaddr; + lport = p->vport; } else { addr = p->vaddr; port = p->vport; + laddr = p->caddr; + lport = p->cport; } - return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr, + lport); } -static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, + const struct ip_vs_conn *cp, + bool out) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, - &cp->caddr, cp->cport, NULL, 0, &p); + if (!out) + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->caddr, cp->cport, &cp->vaddr, + cp->vport, &p); + else + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->daddr, cp->dport, &cp->caddr, + cp->cport, &p); if (cp->pe) { p.pe = cp->pe; @@ -149,31 +234,51 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) p.pe_data_len = cp->pe_data_len; } - return ip_vs_conn_hashkey_param(&p, false); + return ip_vs_conn_hashkey_param(&p, t, out); } -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. 
+/* Hashes ip_vs_conn in conn_tab * returns bool success. */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; + struct ip_vs_rht *t; + u32 hash, hash2; + bool use2; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return 0; - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey_conn(cp); + /* New entries go into recent table */ + t = rcu_dereference(ipvs->conn_tab); + t = rcu_dereference(t->new_tbl); - ct_write_lock_bh(hash); + hash = ip_vs_conn_hashkey_conn(t, cp, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + if (ip_vs_conn_use_hash2(cp)) { + hash2 = ip_vs_conn_hashkey_conn(t, cp, true); + hash_key2 = ip_vs_rht_build_hash_key(t, hash2); + use2 = true; + } else { + hash_key2 = hash_key; + use2 = false; + } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); refcount_inc(&cp->refcnt); - hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pS\n", @@ -182,75 +287,64 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. Caller should hold conn reference. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned int hash; - int ret; + conn_tab_unlock(head, head2); - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey_conn(cp); - - ct_write_lock_bh(hash); - spin_lock(&cp->lock); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del_rcu(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - refcount_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + /* Schedule resizing if load increases */ + if (atomic_read(&ipvs->conn_count) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags)) + mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); return ret; } -/* Try to unlink ip_vs_conn from ip_vs_conn_tab. +/* Try to unlink ip_vs_conn from conn_tab. * returns bool success. 
*/ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; + struct ip_vs_rht *t; bool ret = false; + bool use2; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return refcount_dec_if_one(&cp->refcnt); - hash = ip_vs_conn_hashkey_conn(cp); + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(cp->hn0.hash_key); + hash_key2 = READ_ONCE(cp->hn1.hash_key); + use2 = ip_vs_conn_use_hash2(cp); - ct_write_lock_bh(hash); + conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { - hlist_del_rcu(&cp->c_list); + hlist_bl_del_rcu(&cp->hn0.node); + if (use2) + hlist_bl_del_rcu(&cp->hn1.node); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + conn_tab_unlock(head, head2); + + rcu_read_unlock(); return ret; } /* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from OUTside-to-INside. * p->caddr, p->cport: pkt source address (foreign host) * p->vaddr, p->vport: pkt dest address (load balancer) @@ -258,26 +352,42 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->cport == cp->cport && p->vport == cp->vport && - cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && - ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - rcu_read_unlock(); - return cp; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + if (p->cport == cp->cport && + p->vport == cp->vport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->vaddr) && + (!p->cport ^ + (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) { + /* HIT */ + rcu_read_unlock(); + return cp; + } + } + } } } @@ -350,37 +460,53 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + 
u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (unlikely(p->pe_data && p->pe->ct_match)) { - if (cp->ipvs != p->ipvs) - continue; - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { - if (__ip_vs_conn_get(cp)) - goto out; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (p->pe == cp->pe && + p->pe->ct_match(p, cp) && + __ip_vs_conn_get(cp)) + goto out; + continue; + } + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark + */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? + AF_UNSPEC : p->af, + p->vaddr, &cp->vaddr) && + p->vport == cp->vport && + p->cport == cp->cport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol && + cp->dport != htons(0xffff)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } - continue; } - if (cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - /* protocol should only be IPPROTO_IP if - * p->vaddr is a fwmark */ - ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : - p->af, p->vaddr, &cp->vaddr) && - p->vport == cp->vport && p->cport == cp->cport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (__ip_vs_conn_get(cp)) - goto out; - } } cp = NULL; @@ -396,58 +522,68 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) return cp; } -/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. +/* Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from inside-to-OUTside. 
* p->caddr, p->cport: pkt source address (inside host) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned int hash; - struct ip_vs_conn *cp, *ret=NULL; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; const union nf_inet_addr *saddr; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + struct ip_vs_conn *cp; + u32 hash, hash_key; __be16 sport; - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey_param(p, true); - rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->vport != cp->cport) - continue; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, true); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + /* dir can be 0 for DR/TUN */ + if (READ_ONCE(hn->hash_key) != hash_key) + continue; + cp = ip_vs_hn_to_conn(hn); + if (p->vport != cp->cport) + continue; - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - sport = cp->vport; - saddr = &cp->vaddr; - } else { - sport = cp->dport; - saddr = &cp->daddr; - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + sport = cp->vport; + saddr = &cp->vaddr; + } else { + sport = cp->dport; + saddr = &cp->daddr; + } - if (p->cport == sport && cp->af == p->af && - ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->caddr, saddr) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - ret = cp; - break; + if (p->cport == sport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, saddr) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) + goto out; + } + } } } + cp = NULL; +out: rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), - ret ? "hit" : "not hit"); + cp ? "hit" : "not hit"); - return ret; + return cp; } struct ip_vs_conn * @@ -492,23 +628,289 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) */ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { - if (ip_vs_conn_unhash(cp)) { - struct netns_ipvs *ipvs = cp->ipvs; - int af_id = ip_vs_af_index(cp->af); + struct hlist_bl_head *head, *head2, *head_new; + bool use2 = ip_vs_conn_use_hash2(cp); + struct netns_ipvs *ipvs = cp->ipvs; + int af_id = ip_vs_af_index(cp->af); + u32 hash_r = 0, hash_key_r = 0; + struct ip_vs_rht *t, *tp, *t2; + struct ip_vs_conn_hnode *hn; + u32 hash_key, hash_key_new; + struct ip_vs_conn_param p; + int ntbl; + int dir; - spin_lock_bh(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + /* No packets from inside, so we can do it in 2 steps. */ + dir = use2 ? 1 : 0; + +next_dir: + if (dir) + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr, + cp->dport, &cp->caddr, cport, &p); + else + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, + cport, &cp->vaddr, cp->vport, &p); + hn = dir ? &cp->hn1 : &cp->hn0; + ntbl = 0; + + /* Attempt to rehash cp safely, by informing seqcount readers */ + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(hn->hash_key); + tp = NULL; + +retry: + /* Moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, hash_key)) { + t = rcu_dereference(t->new_tbl); + ntbl++; + /* We are lost? */ + if (ntbl >= 2) + return; + } + + /* Rehashing during resize? Use the recent table for adds */ + t2 = rcu_dereference(t->new_tbl); + /* Calc new hash once per table */ + if (tp != t2) { + hash_r = ip_vs_conn_hashkey_param(&p, t2, dir); + hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); + tp = t2; + } + head = t->buckets + (hash_key & t->mask); + head2 = t2->buckets + (hash_key_r & t2->mask); + head_new = head2; + + if (head > head2 && t == t2) + swap(head, head2); + + /* Lock seqcount only for the old bucket, even if we are on new table + * because it affects the del operation, not the adding. + */ + spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); + + /* Lock buckets in same (increasing) order */ + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(hn->hash_key); + /* Racing with another rehashing ? */ + if (unlikely(hash_key != hash_key_new)) { + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + hash_key = hash_key_new; + goto retry; + } + + spin_lock(&cp->lock); + if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && + (cp->flags & IP_VS_CONN_F_HASHED)) { + /* We do not recalc hash_key_r under lock, we assume the + * parameters in cp do not change, i.e. cport is + * the only possible change. + */ + WRITE_ONCE(hn->hash_key, hash_key_r); + if (!use2) + WRITE_ONCE(cp->hn1.hash_key, hash_key_r); + /* For dir=1 we do not check in flags if hn is already + * rehashed but this check will do it. 
+ */ + if (head != head2) { + hlist_bl_del_rcu(&hn->node); + hlist_bl_add_head_rcu(&hn->node, head_new); + } + if (!dir) { atomic_dec(&ipvs->no_cport_conns[af_id]); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock_bh(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); } + spin_unlock(&cp->lock); + + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + if (dir--) + goto next_dir; } +/* Get default load factor to map conn_count/u_thresh to t->size */ +static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = -3; + else + factor = -1; + /* Double hashing adds twice more nodes for NAT */ + factor--; + return factor; +} + +/* Get the desired conn_tab size */ +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count), + lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); +} + +/* Allocate conn_tab */ +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* scounts: affects readers during resize */ + scounts = clamp(buckets >> 6, 1, 256); + /* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */ + locks = clamp(8, 1, scounts); + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); + return t; +} + +/* conn_tab resizer work */ +static void conn_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct ip_vs_conn_hnode *hn; + struct netns_ipvs *ipvs; + struct ip_vs_conn *cp; + bool more_work = false; + u32 hash, hash_key; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, conn_resize_work.work); + + /* Allow work to be queued again */ + clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags); + t = rcu_dereference_protected(ipvs->conn_tab, 1); + /* Do nothing if table is removed */ + if (!t) + goto out; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto out; + + lfactor = sysctl_conn_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_conn_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto out; + + t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto out; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + + /* Wait RCU readers to see the new table, we do not want new + * conns to go into old table and to be left there. + */ + synchronize_rcu(); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... 
*/ + resched_score = 0; + + /* seqcount_t usage considering PREEMPT_RT rules: + * - other writers (SoftIRQ) => serialize with spin_lock_bh + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + spin_lock_bh(&t->lock[bucket & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) { + cp = ip_vs_hn_to_conn(hn); + hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir); + hash_key = ip_vs_rht_build_hash_key(t_new, hash); + + head2 = t_new->buckets + (hash & t_new->mask); + hlist_bl_lock(head2); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_del_rcu(&hn->node); + WRITE_ONCE(hn->hash_key, hash_key); + /* Keep both hash keys in sync if no double hashing */ + if (!ip_vs_conn_use_hash2(cp)) + WRITE_ONCE(cp->hn1.hash_key, hash_key); + hlist_bl_add_head_rcu(&hn->node, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[bucket & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[bucket & t->lock_mask].l); + if (limit >= 64) + goto same_bucket; + } + + rcu_assign_pointer(ipvs->conn_tab, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. + */ + synchronize_rcu(); + ip_vs_rht_free(t); + +out: + /* Monitor if we need to shrink table */ + queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, + more_work ? 1 : 2 * HZ); +} /* * Bind a connection entry with the corresponding packet_xmit. @@ -792,17 +1194,11 @@ int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); - /* - * Invalidate the connection template + /* Invalidate the connection template. Prefer to avoid + * rehashing, it will move it as first in chain, so use + * only dport as indication, it is not a hash key. 
*/ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } + ct->dport = htons(0xffff); /* * Simply decrease the refcnt of the template, @@ -943,7 +1339,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) /* - * Create a new connection entry and hash it into the ip_vs_conn_tab + * Create a new connection entry and hash it into the conn_tab */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, @@ -961,10 +1357,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, return NULL; } - INIT_HLIST_NODE(&cp->c_list); + INIT_HLIST_BL_NODE(&cp->hn0.node); + INIT_HLIST_BL_NODE(&cp->hn1.node); timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; + cp->hn0.dir = 0; cp->af = p->af; + cp->hn1.dir = 1; cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); @@ -1045,7 +1444,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, if (ip_vs_conntrack_enabled(ipvs)) cp->flags |= IP_VS_CONN_F_NFCT; - /* Hash it in the ip_vs_conn_tab finally */ + /* Hash it in the conn_tab finally */ ip_vs_conn_hash(cp); return cp; @@ -1057,32 +1456,50 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - unsigned int bucket; + struct ip_vs_rht *t; + int gen; + u32 bucket; unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) +static void *ip_vs_conn_array(struct seq_file *seq) { + struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_rht *t = iter->t; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_node *e; int idx; - struct ip_vs_conn *cp; - for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + if (!t) + return NULL; + for (idx = iter->bucket; idx < t->size; idx++) { unsigned int skip = 0; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) + break; + if (hn->dir != 0) + continue; if (skip >= iter->skip_elems) { iter->bucket = idx; - return cp; + return hn; } ++skip; } + if (!(idx & 31)) { + cond_resched_rcu(); + /* New table installed ? 
*/ + if (iter->gen != atomic_read(&ipvs->conn_tab_changes)) + break; + } iter->skip_elems = 0; - cond_resched_rcu(); } iter->bucket = idx; @@ -1093,38 +1510,52 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); rcu_read_lock(); + iter->gen = atomic_read(&ipvs->conn_tab_changes); + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + iter->t = rcu_dereference(ipvs->conn_tab); if (*pos == 0) { iter->skip_elems = 0; iter->bucket = 0; return SEQ_START_TOKEN; } - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *e; + struct ip_vs_conn_hnode *hn = v; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); + + t = iter->t; + if (!t) + return NULL; /* more on same hash chain? */ - e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) { + hlist_bl_for_each_entry_continue_rcu(hn, e, node) { + /* Our cursor was moved to new table ? */ + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) + break; + if (hn->dir != 0) + continue; iter->skip_elems++; - return hlist_entry(e, struct ip_vs_conn, c_list); + return hn; } iter->skip_elems = 0; iter->bucket++; - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) @@ -1140,14 +1571,12 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { - const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); + struct ip_vs_conn_hnode *hn = v; + const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!net_eq(cp->ipvs->net, net)) - return 0; if (cp->pe_data) { pe_data[0] = ' '; len = strlen(cp->pe->name); @@ -1219,10 +1648,6 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); - - if (!net_eq(cp->ipvs->net, net)) - return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->daf == AF_INET6) @@ -1312,22 +1737,33 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); } -/* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { - int idx; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_node *e; struct ip_vs_conn *cp; + struct ip_vs_rht *t; + unsigned int r; + int idx; + r = get_random_u32(); rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; + for (idx = 0; idx < (t->size >> 5); idx++) { + unsigned int hash = (r + idx) & t->mask; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->ipvs != ipvs) + /* Don't care if due to moved entry we jump to another bucket + * and even to new table + */ + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], 
node) { + if (hn->dir != 0) continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { @@ -1374,27 +1810,43 @@ drop: IP_VS_DBG(4, "drop connection\n"); ip_vs_conn_del(cp); } - cond_resched_rcu(); + if (!(idx & 31)) { + cond_resched_rcu(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; + } } + +out: rcu_read_unlock(); } #endif -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ +/* Flush all the connection entries in the conn_tab */ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) + return; + cancel_delayed_work_sync(&ipvs->conn_resize_work); + if (!atomic_read(&ipvs->conn_count)) + goto unreg; flush_again: + /* Rely on RCU grace period while accessing cp after ip_vs_conn_del */ rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) + ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; cp_c = cp->control; @@ -1415,21 +1867,51 @@ flush_again: schedule(); goto flush_again; } + +unreg: + /* Unregister the hash table and release it after RCU grace period. + * This is needed because other works may not be stopped yet and + * they may walk the tables. + */ + t = rcu_dereference_protected(ipvs->conn_tab, 1); + rcu_assign_pointer(ipvs->conn_tab, NULL); + /* Inform readers that conn_tab is changed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } #ifdef CONFIG_SYSCTL void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; struct ip_vs_dest *dest; + struct hlist_bl_node *e; + int old_gen, new_gen; + if (!atomic_read(&ipvs->conn_count)) + return; + old_gen = atomic_read(&ipvs->conn_tab_changes); rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) - continue; +repeat: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + resched_score++; dest = cp->dest; if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) continue; @@ -1444,13 +1926,25 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) IP_VS_DBG(4, "del controlling connection\n"); ip_vs_conn_del(cp_c); } + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + /* netns clean up started, abort delayed work */ + if (!READ_ONCE(ipvs->enable)) + goto out; + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } } - cond_resched_rcu(); - - /* netns clean up started, abort delayed work */ - if (!READ_ONCE(ipvs->enable)) - break; } + +out: rcu_read_unlock(); } #endif @@ -1465,6 +1959,10 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->conn_count, 0); for (idx = 0; idx < IP_VS_AF_MAX; idx++) atomic_set(&ipvs->no_cport_conns[idx], 0); + INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler); + RCU_INIT_POINTER(ipvs->conn_tab, NULL); + atomic_set(&ipvs->conn_tab_changes, 0); + ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, @@ -1500,56 +1998,36 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { + int min = IP_VS_CONN_TAB_MIN_BITS; + int max = IP_VS_CONN_TAB_MAX_BITS; size_t tab_array_size; int max_avail; -#if BITS_PER_LONG > 32 - int max = 27; -#else - int max = 20; -#endif - int min = 8; - int idx; max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; - max_avail -= 2; /* ~4 in hash row */ + /* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */ + max_avail += 1; /* hash table loaded at 50% */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); max = clamp(max_avail, min, max); ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; - ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; /* * Allocate the connection hash table and initialize its list heads */ tab_array_size = array_size(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = kvmalloc_objs(*ip_vs_conn_tab, ip_vs_conn_tab_size); - if (!ip_vs_conn_tab) - return -ENOMEM; + sizeof(struct hlist_bl_head)); /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); - if (!ip_vs_conn_cachep) { - kvfree(ip_vs_conn_tab); + if (!ip_vs_conn_cachep) return -ENOMEM; - } pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", ip_vs_conn_tab_size, tab_array_size / 1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) - INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return 0; } @@ -1559,5 +2037,4 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 869f18e0e835..f5b7a2047291 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows) INIT_LIST_HEAD(&table[rows]); } +/* IPVS Resizable Hash Tables: + * - list_bl buckets with bit lock + * + * Goals: + * - RCU lookup for entry can run in parallel with add/del/move operations + * - hash keys can be on non-contiguous memory + * - support entries with duplicate keys + * - unlink entries without lookup, use the saved table and bucket id + * - resizing can trigger on load change or depending on key refresh period + * - customizable load factor to balance between speed and memory usage + * - 
add/del/move operations should be allowed for any context + * + * Resizing: + * - new table is attached to the current table and all entries are moved + * with new hash key. Finally, the new table is installed as current one and + * the old table is released after RCU grace period. + * - RCU read-side critical sections will walk two tables while resizing is + * in progress + * - new entries are added to the new table + * - entries will be deleted from the old or from the new table, the table_id + * can be saved into entry as part of the hash key to know where the entry is + * hashed + * - move operations may delay readers or to cause retry for the modified + * bucket. As result, searched entry will be found but walkers that operate + * on multiple entries may see same entry twice if bucket walking is retried. + * - for fast path the number of entries (load) can be compared to u_thresh + * and l_thresh to decide when to trigger table growing/shrinking. They + * are calculated based on load factor (shift count), negative value allows + * load to be below 100% to reduce collisions by maintaining larger table + * while positive value tolerates collisions by using smaller table and load + * above 100%: u_thresh(load) = size * (2 ^ lfactor) + * + * Locking: + * - lock: protect seqc if other context except resizer can move entries + * - seqc: seqcount_t, delay/retry readers while entries are moved to + * new table on resizing + * - bit lock: serialize bucket modifications + * - writers may use other locking mechanisms to serialize operations for + * resizing, moving and installing new tables + */ + +void ip_vs_rht_free(struct ip_vs_rht *t) +{ + kvfree(t->buckets); + kvfree(t->seqc); + kvfree(t->lock); + kfree(t); +} + +void ip_vs_rht_rcu_free(struct rcu_head *head) +{ + struct ip_vs_rht *t; + + t = container_of(head, struct ip_vs_rht, rcu_head); + ip_vs_rht_free(t); +} + +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks) +{ + struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL); + int i; + + if (!t) + return NULL; + if (scounts) { + int ml = roundup_pow_of_two(nr_cpu_ids); + + scounts = min(scounts, buckets); + scounts = min(scounts, ml); + t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL); + if (!t->seqc) + goto err; + for (i = 0; i < scounts; i++) + seqcount_init(&t->seqc[i]); + + if (locks) { + locks = min(locks, scounts); + t->lock = kvmalloc_array(locks, sizeof(*t->lock), + GFP_KERNEL); + if (!t->lock) + goto err; + for (i = 0; i < locks; i++) + spin_lock_init(&t->lock[i].l); + } + } + + t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL); + if (!t->buckets) + goto err; + for (i = 0; i < buckets; i++) + INIT_HLIST_BL_HEAD(&t->buckets[i]); + t->mask = buckets - 1; + t->size = buckets; + t->seqc_mask = scounts - 1; + t->lock_mask = locks - 1; + t->u_thresh = buckets; + t->l_thresh = buckets >> 4; + t->bits = order_base_2(buckets); + /* new_tbl points to self if no new table is filled */ + RCU_INIT_POINTER(t->new_tbl, t); + get_random_bytes(&t->hash_key, sizeof(t->hash_key)); + return t; + +err: + ip_vs_rht_free(t); + return NULL; +} + +/* Get the desired table size for n entries based on current table size and + * by using the formula size = n / (2^lfactor) + * lfactor: shift value for the load factor: + * - >0: u_thresh=size << lfactor, for load factor above 100% + * - <0: u_thresh=size >> -lfactor, for load factor below 100% + * - 0: for load factor of 100% + */ +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int 
n, + int lfactor, int min_bits, int max_bits) +{ + if (!t) + return 1 << min_bits; + n = roundup_pow_of_two(n); + if (lfactor < 0) { + int factor = min(-lfactor, max_bits); + + n = min(n, 1 << (max_bits - factor)); + n <<= factor; + } else { + n = min(n >> lfactor, 1 << max_bits); + } + if (lfactor != t->lfactor) + return clamp(n, 1 << min_bits, 1 << max_bits); + if (n > t->size) + return n; + if (n > t->size >> 4) + return t->size; + /* Shrink but keep it n * 2 to prevent frequent resizing */ + return clamp(n << 1, 1 << min_bits, 1 << max_bits); +} + +/* Set thresholds based on table size and load factor: + * u_thresh = size * (2^lfactor) + * l_thresh = u_thresh / 16 + * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink + */ +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits) +{ + if (size >= 1 << max_bits) + t->u_thresh = INT_MAX; /* stop growing */ + else if (lfactor <= 0) + t->u_thresh = size >> min(-lfactor, max_bits); + else + t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor; + + /* l_thresh: shrink when load is 16 times lower, can be 0 */ + if (size >= 1 << max_bits) + t->l_thresh = (1 << max_bits) >> 4; + else if (size > 1 << min_bits) + t->l_thresh = t->u_thresh >> 4; + else + t->l_thresh = 0; /* stop shrinking */ +} + +/* Return hash value for local info (fast, insecure) */ +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2) +{ + u32 v3; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + v3 = ipv6_addr_hash(&addr->in6); + else +#endif + v3 = addr->all[0]; + + return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); +} + static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f7d454df2b58..032425025d88 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -29,6 +29,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> +#include <linux/rcupdate_wait.h> #include <net/net_namespace.h> #include <linux/nsproxy.h> @@ -293,47 +294,59 @@ ip_vs_use_count_dec(void) } - +/* Service hashing: + * Operation Locking order + * --------------------------------------------------------------------------- + * add table service_mutex, svc_resize_sem(W) + * del table service_mutex + * move between tables svc_resize_sem(W), seqcount_t(W), bit lock + * add/del service service_mutex, bit lock + * find service RCU, seqcount_t(R) + * walk services(blocking) service_mutex, svc_resize_sem(R) + * walk services(non-blocking) RCU, seqcount_t(R) + * + * - new tables are linked/unlinked under service_mutex and svc_resize_sem + * - new table is linked on resizing and all operations can run in parallel + * in 2 tables until the new table is registered as current one + * - two contexts can modify buckets: config and table resize, both in + * process context + * - only table resizer can move entries, so we do not protect t->seqc[] + * items with t->lock[] + * - lookups occur under RCU lock and seqcount reader lock to detect if + * services are moved to new table + * - move operations may disturb readers: find operation will not miss entries + * but walkers may see same entry twice if they are forced to retry chains + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold + * service_mutex to disallow new tables to be installed or to check + * svc_table_changes and repeat the RCU read section 
if new table is installed + */ /* * Returns hash value for virtual service */ -static inline unsigned int -ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, +static inline u32 +ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - unsigned int porth = ntohs(port); - __be32 addr_fold = addr->ip; - __u32 ahash; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - ahash = ntohl(addr_fold); - ahash ^= ((size_t) ipvs >> 8); - - return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & - IP_VS_SVC_TAB_MASK; + return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); } /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) +static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, + __u32 fwmark) { - return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); } -/* - * Hashes a service in the svc_table by <netns,proto,addr,port> - * or by fwmark. - * Should be called with locked tables. - */ +/* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned int hash; + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", @@ -341,23 +354,32 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) return 0; } + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + + /* New entries go into recent table */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + t = rcu_dereference_protected(t->new_tbl, 1); + if (svc->fwmark == 0) { /* - * Hash it by <netns,protocol,addr,port> + * Hash it by <protocol,addr,port> */ - hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, + hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, &svc->addr, svc->port); } else { /* * Hash it by fwmark */ - hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); + hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); } - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); - + head = t->buckets + (hash & t->mask); + hlist_bl_lock(head); + WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); + hlist_bl_add_head_rcu(&svc->s_list, head); + hlist_bl_unlock(head); + return 1; } @@ -368,17 +390,45 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash_key2; + u32 hash_key; + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } + t = rcu_dereference_protected(ipvs->svc_table, 1); + hash_key = READ_ONCE(svc->hash_key); + /* We need to lock the bucket in the right table */ + if (ip_vs_rht_same_table(t, hash_key)) { + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + /* Ensure hash_key is read under lock */ + hash_key2 = READ_ONCE(svc->hash_key); + /* Moved to new table ? 
*/ + if (hash_key != hash_key2) { + hlist_bl_unlock(head); + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key2 & t->mask); + hlist_bl_lock(head); + } + } else { + /* It is already moved to new table */ + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + } /* Remove it from svc_table */ - hlist_del_rcu(&svc->s_list); + hlist_bl_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); + hlist_bl_unlock(head); return 1; } @@ -390,18 +440,29 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && - svc->port == vport && svc->protocol == protocol && - !svc->fwmark) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->af == af && + ip_vs_addr_equal(af, &svc->addr, vaddr) && + svc->port == vport && + svc->protocol == protocol && !svc->fwmark) { + /* HIT */ + return svc; + } + } } } @@ -415,16 +476,26 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashval(t, af, fwmark); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + return svc; + } + } } } @@ -487,6 +558,220 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol return svc; } +/* Return the number of registered services */ +static int ip_vs_get_num_services(struct netns_ipvs *ipvs) +{ + int ns = 0, ni = IP_VS_AF_MAX; + + while (--ni >= 0) + ns += atomic_read(&ipvs->num_services[ni]); + return ns; +} + +/* Get default load factor to map num_services/u_thresh to t->size */ +static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = -3; /* grow if load is above 12.5% */ + else + factor = -2; /* grow if load is above 25% */ + return factor; +} + +/* Get the desired svc_table size */ +static int 
ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs), + lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); +} + +/* Allocate svc_table */ +static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs, + int buckets, int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* No frequent lookups to race with resizing, so use max of 64 + * seqcounts. Only resizer moves entries, so use 0 locks. + */ + scounts = clamp(buckets >> 4, 1, 64); + locks = 0; + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); + return t; +} + +/* svc_table resizer work */ +static void svc_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + struct ip_vs_rht *t_free = NULL; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct ip_vs_service *svc; + struct netns_ipvs *ipvs; + bool more_work = true; + seqcount_t *sc; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work); + + if (!down_write_trylock(&ipvs->svc_resize_sem)) + goto out; + if (!mutex_trylock(&ipvs->service_mutex)) + goto unlock_sem; + more_work = false; + clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + t = rcu_dereference_protected(ipvs->svc_table, 1); + /* Do nothing if table is removed */ + if (!t) + goto unlock_m; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto unlock_m; + + lfactor = sysctl_svc_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_svc_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto unlock_m; + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto unlock_m; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + /* Allow add/del to new_tbl while moving from old table */ + mutex_unlock(&ipvs->service_mutex); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) + goto unlock_sem; + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... 
*/ + resched_score = 0; + + sc = &t->seqc[bucket & t->seqc_mask]; + /* seqcount_t usage considering PREEMPT_RT rules: + * - we are the only writer => preemption can be allowed + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + local_bh_disable(); + preempt_disable_nested(); + write_seqcount_begin(sc); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) { + u32 hash; + + /* New hash for the new table */ + if (svc->fwmark == 0) { + /* Hash it by <protocol,addr,port> */ + hash = ip_vs_svc_hashval(t_new, svc->af, + svc->protocol, + &svc->addr, svc->port); + } else { + /* Hash it by fwmark */ + hash = ip_vs_svc_fwm_hashval(t_new, svc->af, + svc->fwmark); + } + hlist_bl_del_rcu(&svc->s_list); + head2 = t_new->buckets + (hash & t_new->mask); + + hlist_bl_lock(head2); + WRITE_ONCE(svc->hash_key, + ip_vs_rht_build_hash_key(t_new, hash)); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_add_head_rcu(&svc->s_list, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(sc); + preempt_enable_nested(); + local_bh_enable(); + if (limit >= 64) + goto same_bucket; + } + + /* Tables can be switched only under service_mutex */ + while (!mutex_trylock(&ipvs->service_mutex)) { + cond_resched(); + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_sem; + } + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + + rcu_assign_pointer(ipvs->svc_table, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); + t_free = t; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + +unlock_sem: + up_write(&ipvs->svc_resize_sem); + + if (t_free) { + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. 
+ */ + synchronize_rcu(); + ip_vs_rht_free(t_free); + } + +out: + if (!READ_ONCE(ipvs->enable) || !more_work || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + return; + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); +} static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) @@ -1357,12 +1642,14 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_rht *tc_new = NULL; + struct ip_vs_rht *t, *t_new = NULL; int af_id = ip_vs_af_index(u->af); - struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct ip_vs_pe *pe = NULL; int ret_hooks = -1; + int ret = 0; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1404,6 +1691,29 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (!t) { + int lfactor = sysctl_svc_lfactor(ipvs); + int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + ret = -ENOMEM; + goto out_err; + } + } + + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + ret = -ENOMEM; + goto out_err; + } + } + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) @@ -1449,6 +1759,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret < 0) goto out_err; + if (t_new) { + clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + rcu_assign_pointer(ipvs->svc_table, t_new); + t_new = NULL; + } + if (tc_new) { + rcu_assign_pointer(ipvs->conn_tab, tc_new); + tc_new = NULL; + } + /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter[af_id]); @@ -1470,6 +1790,12 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Hash the service into the service table */ ip_vs_svc_hash(svc); + /* Schedule resize work */ + if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { @@ -1484,6 +1810,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (tc_new) + ip_vs_rht_free(tc_new); + if (t_new) + ip_vs_rht_free(t_new); if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { @@ -1671,10 +2001,38 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) */ static int ip_vs_del_service(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs; + struct ip_vs_rht *t, *p; + int ns; + if (svc == NULL) return -EEXIST; + ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - + t = rcu_dereference_protected(ipvs->svc_table, 1); + + /* Drop the table if no more services */ + ns = ip_vs_get_num_services(ipvs); + if (!ns) { + /* Stop the resizer and drop the tables */ + set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + cancel_delayed_work_sync(&ipvs->svc_resize_work); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 
+ if (p == t) + break; + t = p; + } + } + } else if (ns <= t->l_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) { + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + } return 0; } @@ -1684,14 +2042,36 @@ static int ip_vs_del_service(struct ip_vs_service *svc) */ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - struct hlist_node *n; + struct hlist_bl_node *ne; + struct hlist_bl_node *e; + struct ip_vs_rht *t, *p; + + /* Stop the resizer and drop the tables */ + if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + cancel_delayed_work_sync(&ipvs->svc_resize_work); + /* No resizer, so now we have exclusive write access */ + + if (ip_vs_get_num_services(ipvs)) { + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) + ip_vs_unlink_service(svc, cleanup); + } + } - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], - s_list) - ip_vs_unlink_service(svc, cleanup); + /* Unregister the hash table and release it after RCU grace period */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } return 0; } @@ -1742,19 +2122,44 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; struct ip_vs_dest *dest; - unsigned int idx; + int old_gen, new_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + + old_gen = atomic_read(&ipvs->svc_table_changes); + rcu_read_lock(); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) + +repeat: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { list_for_each_entry_rcu(dest, &svc->destinations, - n_list) + n_list) { ip_vs_forget_dev(dest, dev); + resched_score += 10; + } + resched_score++; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } + } } rcu_read_unlock(); @@ -1777,14 +2182,28 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) static int ip_vs_zero_all(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; + + rcu_read_lock(); - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { ip_vs_zero_service(svc); + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + } } + rcu_read_unlock(); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -2218,7 +2637,8 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - int bucket; + struct ip_vs_rht *t; + u32 bucket; }; /* @@ -2239,17 +2659,23 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) } } - +/* Do not expect consistent view during add, del and move(table resize). + * We may miss entries and even show duplicates. + */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; - int idx; + struct ip_vs_rht *t = iter->t; struct ip_vs_service *svc; + struct hlist_bl_node *e; + int idx; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { + if (!t) + return NULL; + for (idx = 0; idx < t->size; idx++) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; if (pos-- == 0) { iter->bucket = idx; return svc; @@ -2262,18 +2688,22 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ip_vs_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + rcu_read_lock(); + iter->t = rcu_dereference(ipvs->svc_table); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct hlist_node *e; - struct ip_vs_iter *iter; struct ip_vs_service *svc; - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_iter *iter; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) @@ -2281,15 +2711,22 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; + t = iter->t; + if (!t) + return NULL; - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); - if (e) - return hlist_entry(e, struct ip_vs_service, s_list); + hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { + /* Our cursor was moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; + return svc; + } - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - hlist_for_each_entry_rcu(svc, - &ipvs->svc_table[iter->bucket], - s_list) { + while (++iter->bucket < t->size) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], + s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; return svc; } } @@ -2770,13 +3207,18 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { - int idx, count=0; - struct ip_vs_service *svc; struct ip_vs_service_entry entry; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int count = 0; int ret = 0; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + lockdep_assert_held(&ipvs->svc_resize_sem); + /* All service modifications are disabled, go ahead */ + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET) continue; @@ -2948,6 +3390,35 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } + if (cmd == IP_VS_SO_GET_SERVICES) { + struct ip_vs_get_services *get; + size_t size; + + get = (struct ip_vs_get_services *)arg; + size = struct_size(get, entrytable, get->num_services); + if (*len != size) { + pr_err("length: %u != %zu\n", *len, size); + return -EINVAL; + } + /* Protect against table resizer moving the entries. + * Try reverse locking, so that we do not hold the mutex + * while waiting for semaphore. + */ + while (1) { + ret = down_read_killable(&ipvs->svc_resize_sem); + if (ret < 0) + return ret; + if (mutex_trylock(&ipvs->service_mutex)) + break; + up_read(&ipvs->svc_resize_sem); + cond_resched(); + } + ret = __ip_vs_get_service_entries(ipvs, get, user); + up_read(&ipvs->svc_resize_sem); + mutex_unlock(&ipvs->service_mutex); + return ret; + } + mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: @@ -2976,22 +3447,6 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } break; - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - size_t size; - - get = (struct ip_vs_get_services *)arg; - size = struct_size(get, entrytable, get->num_services); - if (*len != size) { - pr_err("length: %u != %zu\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(ipvs, get, user); - } - break; - case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; @@ -3277,15 +3732,19 @@ nla_put_failure: static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int start = cb->args[0]; + int idx = 0; + down_read(&ipvs->svc_resize_sem); rcu_read_lock(); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) { + ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3297,6 +3756,7 @@ static int 
ip_vs_genl_dump_services(struct sk_buff *skb, nla_put_failure: rcu_read_unlock(); + up_read(&ipvs->svc_resize_sem); cb->args[0] = idx; return skb->len; @@ -4306,8 +4766,10 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) - INIT_HLIST_HEAD(&ipvs->svc_table[idx]); + init_rwsem(&ipvs->svc_resize_sem); + INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); + atomic_set(&ipvs->svc_table_changes, 0); + RCU_INIT_POINTER(ipvs->svc_table, NULL); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4326,6 +4788,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); + ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs); /* procfs stats */ ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 85f31d71e29a..0c83c7b69581 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -132,9 +132,9 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, } static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, - u32 initval, bool inverse) + struct ip_vs_rht *t, bool inverse) { - return jhash(p->pe_data, p->pe_data_len, initval); + return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]); } static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b2ba3befbd55..93038abbf5e0 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1755,6 +1755,28 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; + /* Backup server can be started without services just to sync conns, + * make sure conn_tab is created even if ipvs->enable is 0. 
+ */ + if (state == IP_VS_STATE_BACKUP) { + mutex_lock(&ipvs->service_mutex); + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, + lfactor); + struct ip_vs_rht *tc_new; + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + mutex_unlock(&ipvs->service_mutex); + result = -ENOMEM; + goto out_module; + } + rcu_assign_pointer(ipvs->conn_tab, tc_new); + } + mutex_unlock(&ipvs->service_mutex); + } + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1922,6 +1944,7 @@ out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); +out_module: /* decrease the module use count */ ip_vs_use_count_dec(); return result; diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 41503847d9d7..0507d67cad27 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -165,18 +165,26 @@ static struct nf_logger nf_arp_logger __read_mostly = { static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk) { + const struct socket *sock; + const struct file *file; + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? READ_ONCE(sock->file) : NULL; + if (file) { + const struct cred *cred = file->f_cred; nf_log_buf_add(m, "UID=%u GID=%u ", from_kuid_munged(&init_user_ns, cred->fsuid), from_kgid_munged(&init_user_ns, cred->fsgid)); } - read_unlock_bh(&sk->sk_callback_lock); } static noinline_for_stack int diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b35a90955e2e..730c2541a849 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -611,19 +611,26 @@ __build_packet_message(struct nfnl_log_net *log, /* UID */ sk = skb->sk; if (sk && sk_fullsock(sk)) { - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - struct file *file = sk->sk_socket->file; + const struct socket *sock; + const struct file *file; + + /* The sk pointer remains valid as long as the skb is. + * The sk_socket and file pointer may become NULL + * if the socket is closed. + * Both structures (including file->cred) are RCU freed + * which means they can be accessed within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? 
READ_ONCE(sock->file) : NULL; + if (file) { const struct cred *cred = file->f_cred; struct user_namespace *user_ns = inst->peer_user_ns; __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); - read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; - } else - read_unlock_bh(&sk->sk_callback_lock); + } } /* local sequence number */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7f5248b5f1ee..5379d8ff39c0 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -545,14 +545,23 @@ nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { + const struct socket *sock; + const struct file *file; const struct cred *cred; if (!sk_fullsock(sk)) return 0; - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - cred = sk->sk_socket->file->f_cred; + /* The sk pointer remains valid as long as the skb is. + * The sk_socket and file pointer may become NULL + * if the socket is closed. + * Both structures (including file->cred) are RCU freed + * which means they can be accessed within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? READ_ONCE(sock->file) : NULL; + if (file) { + cred = file->f_cred; if (nla_put_be32(skb, NFQA_UID, htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) goto nla_put_failure; @@ -560,11 +569,9 @@ static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) goto nla_put_failure; } - read_unlock_bh(&sk->sk_callback_lock); return 0; nla_put_failure: - read_unlock_bh(&sk->sk_callback_lock); return -1; } @@ -585,15 +592,8 @@ static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) - - if (!skb || !sk_fullsock(skb->sk)) - return 0; - - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->secmark) seclen = security_secid_to_secctx(skb->secmark, ctx); - read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 983158274c68..d0df6cf374d1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -131,33 +131,36 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key, u32 *dest, const struct nft_pktinfo *pkt) { - struct sock *sk = skb_to_full_sk(pkt->skb); - struct socket *sock; + const struct sock *sk = skb_to_full_sk(pkt->skb); + const struct socket *sock; + const struct file *file; if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) return false; - read_lock_bh(&sk->sk_callback_lock); - sock = sk->sk_socket; - if (!sock || !sock->file) { - read_unlock_bh(&sk->sk_callback_lock); + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? 
READ_ONCE(sock->file) : NULL; + if (!file) return false; - } switch (key) { case NFT_META_SKUID: *dest = from_kuid_munged(sock_net(sk)->user_ns, - sock->file->f_cred->fsuid); + file->f_cred->fsuid); break; case NFT_META_SKGID: *dest = from_kgid_munged(sock_net(sk)->user_ns, - sock->file->f_cred->fsgid); + file->f_cred->fsgid); break; default: break; } - read_unlock_bh(&sk->sk_callback_lock); return true; } diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 50332888c8d2..5bfb4843df66 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -63,11 +63,12 @@ static bool owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; - const struct file *filp; struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); + const struct socket *sock; + const struct file *filp; - if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) + if (!sk || !READ_ONCE(sk->sk_socket) || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* @@ -76,23 +77,25 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - read_lock_bh(&sk->sk_callback_lock); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - if (filp == NULL) { - read_unlock_bh(&sk->sk_callback_lock); + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + filp = sock ? READ_ONCE(sock->file) : NULL; + if (filp == NULL) return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; - } if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); + if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ - !(info->invert & XT_OWNER_UID)) { - read_unlock_bh(&sk->sk_callback_lock); + !(info->invert & XT_OWNER_UID)) return false; - } } if (info->match & XT_OWNER_GID) { @@ -117,13 +120,10 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } } - if (match ^ !(info->invert & XT_OWNER_GID)) { - read_unlock_bh(&sk->sk_callback_lock); + if (match ^ !(info->invert & XT_OWNER_GID)) return false; - } } - read_unlock_bh(&sk->sk_callback_lock); return true; } |
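
The resize decisions in this series hang off two per-table watermarks derived from a signed load-factor shift: u_thresh = size * 2^lfactor (negative lfactor keeps the load below 100% to cut collisions), and l_thresh = u_thresh / 16 as the shrink trigger. The standalone user-space sketch below only mirrors the arithmetic of ip_vs_rht_set_thresholds() as it appears in the ip_vs_core.c hunk so the grow/shrink points can be eyeballed; the min_bits/max_bits values, sample sizes and the imin() helper are illustrative choices, not the kernel's configuration.

```c
/* Sketch of the u_thresh/l_thresh arithmetic from ip_vs_rht_set_thresholds();
 * not kernel code, limits and sample values below are made up.
 */
#include <limits.h>
#include <stdio.h>

static int imin(int a, int b) { return a < b ? a : b; }

static void set_thresholds(int size, int lfactor, int min_bits, int max_bits,
			   int *u_thresh, int *l_thresh)
{
	if (size >= 1 << max_bits)
		*u_thresh = INT_MAX;			/* stop growing */
	else if (lfactor <= 0)
		*u_thresh = size >> imin(-lfactor, max_bits);
	else
		*u_thresh = imin(size, 1 << (30 - lfactor)) << lfactor;

	if (size >= 1 << max_bits)
		*l_thresh = (1 << max_bits) >> 4;
	else if (size > 1 << min_bits)
		*l_thresh = *u_thresh >> 4;		/* shrink at 1/16 load */
	else
		*l_thresh = 0;				/* stop shrinking */
}

int main(void)
{
	const int min_bits = 8, max_bits = 20;		/* example limits only */
	const int sizes[] = { 256, 4096, 65536 };
	const int lfactors[] = { -3, 0, 2 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		for (unsigned j = 0; j < sizeof(lfactors) / sizeof(lfactors[0]); j++) {
			int u, l;

			set_thresholds(sizes[i], lfactors[j], min_bits,
				       max_bits, &u, &l);
			printf("size=%6d lfactor=%+d -> u_thresh=%10d l_thresh=%6d\n",
			       sizes[i], lfactors[j], u, l);
		}
	}
	return 0;
}
```

With the default load factors chosen in the patch (-3/-4 for init_net, -1/-2 otherwise) this means the table is kept several times larger than the entry count, and a shrink is only considered once the load drops sixteen times below the grow threshold, which avoids resize ping-pong.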
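
Several of the walkers above tell which table an entry currently belongs to purely from its saved hash_key (ip_vs_rht_same_table(), and the table_id flip done on each resize). Those helpers live in include/net/ip_vs.h, which is not among the hunks shown here, so the following is only a guessed encoding under the assumption that the table generation occupies one high bit of the key while the low bits index the bucket; the real layout may differ.

```c
/* Hypothetical sketch of hash_key/table_id handling; the real
 * ip_vs_rht_build_hash_key()/ip_vs_rht_same_table() are in
 * include/net/ip_vs.h and may use a different layout.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RHT_TABLE_ID_MASK	0x80000000u	/* assumed generation bit */

struct rht_stub {
	uint32_t table_id;	/* 0 or RHT_TABLE_ID_MASK, flipped on resize */
	uint32_t mask;		/* buckets - 1 */
};

static uint32_t build_hash_key(const struct rht_stub *t, uint32_t hash)
{
	/* bucket index in the low bits, tagged with the owning table */
	return (hash & ~RHT_TABLE_ID_MASK) | t->table_id;
}

static bool same_table(const struct rht_stub *t, uint32_t hash_key)
{
	return (hash_key & RHT_TABLE_ID_MASK) == t->table_id;
}

int main(void)
{
	struct rht_stub old = { .table_id = 0, .mask = 255 };
	struct rht_stub new = { .table_id = old.table_id ^ RHT_TABLE_ID_MASK,
				.mask = 1023 };
	uint32_t key = build_hash_key(&old, 0x1234abcdu);

	/* An entry keyed for the old table is not "ours" in the new one,
	 * so lookups walking both tables can stop at the first mismatch.
	 */
	printf("bucket=%u in_old=%d in_new=%d\n",
	       (unsigned)(key & old.mask),
	       (int)same_table(&old, key), (int)same_table(&new, key));
	return 0;
}
```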
